#!/usr/bin/perl
#

# filter-out-iea.pl
# version: $Revision: 1.7 $
# date: $Date: 2011/06/03 15:35:00 $
#
# specification of the gene_association format is defined at:
#   http://www.geneontology.org/GO.annotation.html#file
#
# Requires Perl 5.6.1 or later
#
# Maintained by the SGD for the Gene Ontology Consortium
#  author: Mike Cherry (cherry@stanford.edu)
#          Anand Sethuraman (anand@genome.stanford.edu) updated July 2005
#
###############################################################################

use strict;

############ Get and check user passed in single character switches ##########
# process command line arguments
# Getopt::Std stores the value of each switch in the package variable
# $opt_<letter>, so the ones this script reads are declared here.
our $opt_h;
our $opt_i;
our $opt_o;

use Getopt::Std;

# -h is a flag; -o and -i take a value.  -p and -x also take a value: they
# are accepted on the command line but not used anywhere in this script.
# The return value is not checked, so an unknown switch is reported by
# Getopt::Std on STDERR and processing continues.
getopts('ho:i:p:x:');

# TRUE if the user asked for the usage message (-h)
my $printhelp = defined($opt_h);

# "-" selects STDIN for input and STDOUT for output; check_options replaces
# these defaults when -i / -o are given
my $inputfile = "-";
my $outputfile = "-";

# check the passed in options (prints usage and exits when -h was given)
check_options();
###############################################################################

################## GAF column layout and main program #########################
# current line of text (file-scoped, visible to the subroutines below)
my $line = "";

# Column positions (0-based index into a TAB-split GAF line).
# Only EVIDENCE is referenced by the code in this script.
use constant DB => 0;
use constant DB_OBJECT_ID => 1;
use constant DB_OBJECT_SYMBOL => 2;
use constant QUALIFIER => 3;
use constant GOID => 4;
use constant REFERENCE => 5;
use constant EVIDENCE => 6;
use constant WITH => 7;
use constant ASPECT => 8;
use constant DB_OBJECT_NAME => 9;
use constant DB_OBJECT_SYNONYM => 10;
use constant DB_OBJECT_TYPE => 11;
use constant TAXON => 12;
use constant DATE => 13;
use constant ASSIGNED_BY => 14;
use constant ANNOTATION_XP => 15;         # ignored if GAF version < 2
use constant GENE_PRODUCT_ISOFORM => 16;  # ignored if GAF version < 2

# Number of TAB delimited columns in file
use constant COLNUM => 17;

# Positions within a per-column description record.
# None of these are referenced anywhere in this script.
use constant LABEL => 0;
use constant CARDINAL => 1;
use constant SPECIAL => 2;
use constant CHECKDB => 3;
use constant DUBCOLON => 4;

### parse gene associations file
# Called with explicit parentheses: the "&name;" form would implicitly hand
# the caller's @_ to the subroutine.
parse_gene_assoc_file();

exit;

###############################################################################
################################           ####################################
################################ FUNCTIONS ####################################
################################           ####################################
###############################################################################

###############################################################################
sub check_options
###############################################################################
# Act on the parsed command line switches.
#
# Reads the file-level variables $printhelp, $opt_i and $opt_o.
#   - When -h was given, prints the usage message to STDERR and exits.
#   - Otherwise copies the -i / -o values into $inputfile / $outputfile,
#     leaving their defaults untouched when a switch was not supplied.
#
# Takes no arguments and returns nothing.
###############################################################################
{
    if ($printhelp)
    {
        print STDERR <<END;

      Usage:  $0 [-h] [-o filename] [-i input file] 

	  -h displays this message
	  -i input from a GAF file, may be gzipped (.gz) or compressed (.Z).
             STDIN is the default.
	  -o output to specified filename, output gzipped if filename ends with .gz
             STDOUT is the default.

	  examples:

	      remove all IEA annotations from a gene association file

		  % $0 -i gene_association.sgd.gz -o testing.sgd.output

END

        exit;
    }

    # Test for "defined and non-empty" rather than truth, so that a file
    # literally named "0" is not silently ignored.
    if ( defined($opt_i) && $opt_i ne '' )
    {
        $inputfile = $opt_i;
    }

    if ( defined($opt_o) && $opt_o ne '' )
    {
        $outputfile = $opt_o;
    }

    return;
}

###############################################################################
sub parse_gene_assoc_file
{
###############################################################################
# Copy the gene association file named by $inputfile to $outputfile,
# dropping every annotation line whose evidence column is "IEA".
#
#   - Lines starting with "!" (GAF comment/header lines) are copied unchanged.
#   - Lines with too few columns to have an evidence field are copied
#     unchanged as well.
#   - Input ending in .gz or .Z is decompressed with "gzip -dc"; output
#     ending in .gz is compressed with "gzip -c".
#   - "-" means STDIN for input and STDOUT for output.
#
# Takes no arguments, returns nothing, and dies on any open, write, close
# or gzip failure.
###############################################################################

    my $in  = _open_gaf_input($inputfile);
    my $out = _open_gaf_output($outputfile);

    # one iteration per line of the input file
    while ( defined( my $record = <$in> ) )
    {
        chomp $record;

        # comment lines are passed straight through; only data lines are
        # examined for their evidence code
        unless ( $record =~ m/^\!/ )
        {
            # split TAB delimited columns
            my @fields   = split( /\t/, $record );
            my $evidence = $fields[EVIDENCE];

            next if defined($evidence) && $evidence eq "IEA";
        }

        print {$out} "$record\n"
            or die "Cannot write to output file $outputfile: $!\n";
    }

    # Closing explicitly surfaces a failed gzip child or a failed flush,
    # which would otherwise go unnoticed.
    _close_gaf_handle( $in,  "input $inputfile" );
    _close_gaf_handle( $out, "output $outputfile" );

    return;
}

# Open $path for reading and return the filehandle; "-" yields STDIN and
# names ending in .gz/.Z are read through "gzip -dc".
sub _open_gaf_input
{
    my ($path) = @_;

    return \*STDIN if $path eq '-';

    # 3-arg open: the file name is never interpreted as a mode or a command
    open( my $raw, '<', $path )
        or die "Cannot open input file $path: $!\n";
    return $raw unless $path =~ /.\.(?:gz|Z)$/;

    # Fork without a shell; the child reads the already opened file on its
    # STDIN, so the file name never appears on a command line.
    my $pid = open( my $pipe, '-|' );
    defined($pid) or die "Cannot fork to decompress $path: $!\n";
    if ( $pid == 0 )
    {
        open( STDIN, '<&' . fileno($raw) )
            or die "Cannot read gzipped input $path: $!\n";
        exec( 'gzip', '-dc' )
            or die "Cannot run gzip to decompress $path: $!\n";
    }

    close($raw);
    return $pipe;
}

# Open $path for writing and return the filehandle; "-" yields STDOUT and
# names ending in .gz are written through "gzip -c".
sub _open_gaf_output
{
    my ($path) = @_;

    return \*STDOUT if $path eq '-';

    open( my $raw, '>', $path )
        or die "Cannot open output file $path: $!\n";
    return $raw unless $path =~ /.\.gz$/;

    # Same shell-free approach as for input: the child writes its compressed
    # stream to the already opened file through its STDOUT.
    my $pid = open( my $pipe, '|-' );
    defined($pid) or die "Cannot fork to compress $path: $!\n";
    if ( $pid == 0 )
    {
        open( STDOUT, '>&' . fileno($raw) )
            or die "Cannot write gzipped output $path: $!\n";
        exec( 'gzip', '-c' )
            or die "Cannot run gzip to compress $path: $!\n";
    }

    close($raw);
    return $pipe;
}

# Close $fh and die with a message naming $label if the close fails.
sub _close_gaf_handle
{
    my ( $fh, $label ) = @_;

    return if close($fh);

    # For a piped handle, $! is 0 when the only problem was a non-zero exit
    # status of the child; the status is then in $?.
    die "Error closing $label: $!\n" if $!;
    die "gzip failed for $label (wait status $?)\n";
}

###############################################################################

__END__

=head1 NAME

I<filter-out-iea.pl> - removes associations where IEA is specified

=head1 SYNOPSIS

=over

=item print usage

  filter-out-iea.pl -h

=back

=over

=item remove IEA annotations from the specified gene association file

  filter-out-iea.pl -i gene_association.sgd.gz

=back

=over

=item filter a GA file and write the result to a gzipped output file

  filter-out-iea.pl -i gene_association.tair.gz -o gene_association.tair.noiea.gz

=back

=head1 DESCRIPTION

Reads a gene association file and writes every line to the output
except annotations whose evidence code (column 6 in the listing
below) is IEA.  Comment lines, which start with "!", are passed
through unchanged.

=head1 ARGUMENTS

Arguments can control the input file, the project name, the level of
detail and whether the filtered results are output.

=over

=item -h

print usage message

=item -q

quiet mode, don't print final report to STDERR

=item -i

name of input gene association file.  The file can be compressed or
gzipped.  STDIN is the default.

Caveat: The project name is automatically determined from the name of
the gene association file.  When using STDIN for input you must use
the -p option to specify the project name, otherwise all rows will be
filtered out.

=back

=head1 INPUT

The specification of the gene_association format is defined at:
http://www.geneontology.org/GO.annotation.html#file

=over

=item GA file column definitions

 0: DB, database contributing the file (always "SGD" for this file).
 1: DB_Object_ID, SGDID (SGD's unique identifier for genes and
    features).
 2: DB_Object_Symbol, see below
 3: Qualifier (optional), one or more of 'NOT', 'contributes_to',
    'colocalizes_with' as qualifier(s) for a GO annotation, when needed,
    multiples separated by pipe (|)
 4: GO ID, unique numeric identifier for the GO term
 5: DB:Reference(|DB:Reference), the reference associated with the GO
    annotation
 6: Evidence, the evidence code for the GO annotation
 7: With (or) From (optional), any With or From qualifier for the GO
    annotation
 8: Aspect, which ontology the GO term belongs (Function, Process or
    Component)
 9: DB_Object_Name(|Name) (optional), a name for the gene product in
    words, e.g. 'acid phosphatase'
10: DB_Object_Synonym(|Synonym) (optional), see below
11: DB_Object_Type, type of object annotated, e.g. gene, protein, etc.
12: taxon(|taxon), taxonomic identifier of species encoding gene
    product
13: Date, date GO annotation was defined in the format YYYYMMDD
14: Assigned_by, source of the annotation (always "SGD" for this file)

=item Config file format

 project_name=Saccharomyces Genome Database (SGD)
 contact_email=yeast-curator@yeastgenome.org
 project_url=http://www.yeastgenome.org/
 funding_source=NHGRI at US NIH, grant number 5-P41-HG001315
 email_report=yeast-curator@yeastgenome.org,cherry@stanford.edu

=back

=head1 OUTPUT

The default output using the -w output is a validated gene association
file on STDOUT. See the INPUT section for details on this format.
When using -r option, two output files will be created: .filtered.gz
and .report files.  See the INPUT section for config file format.

=head1 REASON ROWS WOULD BE REJECTED

The following is a brief summary of the common errors this script will find.

   1. Not the correct number of columns.
   2. Any leading or trailing spaces on any field.
   3. Cardinality does not match format specification.
   4. DB abbreviation is not one of the standard set used by the GO Consortium.
   5. Qualifier column can only include NOT, contributes_to or colocalizes_with
   6. One of the three aspects (ontologies) is stated for each line.
   7. Evidence code column needs to be present and one of the standard set.
   8. DB Object Type is one of the defined set.
   9. Stated Taxid is allowed for the particular project file.
  10. GOID is not obsolete.
  11. Date is in proper format.
  12. IEA annotations are less than one year old.

=head1 FUTURE ENHANCEMENTS

 Check GOID and Aspect column for consistency.

=cut

