#!/usr/bin/perl
#

use strict;
use FindBin qw($Bin);

############ Define GO CVS directory, OBO and abbreviations files #############
# Start from the directory that holds this script ($Bin comes from FindBin).
# A copy is taken so that $Bin itself is not modified by the substitution.
my $gocvsbase = $Bin;

# Remove the "/software/utilities" path component to obtain the base of the
# GO CVS checkout.  If the GO CVS directory structure changes this will also
# need to change.
$gocvsbase =~ s|/software/utilities||;

# Default locations of the ontology and abbreviation files; check_options
# replaces them when -o / -x are given.
# NOTE(review): $obofile is assigned here and by -o, but nothing in the code
# visible in this file reads it afterwards.
my $obofile = "$gocvsbase/ontology/gene_ontology_edit.obo";
my $abbsfile = "$gocvsbase/doc/GO.xrf_abbs";

# Name of the ".conf" file that belongs to the input file.  It stays undefined
# until parse_gene_assoc_config_file sets it to "<base file name>.conf", which
# only happens when a report is requested (-r switch).
my $configfile;
###############################################################################

############ Get and check user passed in single character switches ##########
# process command line arguments
# Getopt::Std stores each switch in the package variable $opt_<letter>, so
# these have to be declared with "our" to satisfy "use strict".
our $opt_h;    # -h  print usage and exit
our $opt_q;    # -q  quiet mode, suppress the final report on STDERR
our $opt_d;    # -d  detailed, line by line error messages on STDERR
our $opt_e;    # -e  write the bad lines to STDOUT
our $opt_w;    # -w  write the good lines to STDOUT
our $opt_r;    # -r  write the report file and the filtered file
our $opt_o;    # -o  <file> alternative OBO file
our $opt_i;    # -i  <file> input gene association file
our $opt_p;    # -p  <name> project name ("nocheck" clears it)
our $opt_x;    # -x  <file> alternative GO.xrf_abbs file

use Getopt::Std;

# h, q, d, e, w, r are boolean switches; o, i, p, x take a value.
# The return value of getopts is not checked here.
getopts('hqdewro:i:p:x:');

# Boolean flags derived from the switches: each one is TRUE when the
# corresponding switch was present on the command line.
my $printhelp = defined($opt_h);
my $quietmode = defined($opt_q);
my $detail = defined($opt_d);
my $writebad = defined($opt_e);
my $writegood = defined($opt_w);
my $writereport = defined($opt_r);
# "-" makes the input default to STDIN when -i is not given
my $inputfile = "-";
my $projectname = "";

# check the passed in options
# (check_options may replace $obofile, $abbsfile, $inputfile and $projectname,
# and exits after printing the usage text when -h was given)
&check_options;
###############################################################################

################## Define variables to keep track of errors ###################
# number of the line currently being read from the input file (1-based)
my $linenum = 0;

# current line of text
my $line = "";

# per-column error counts, indexed by the 0-based column position;
# checkwarn increments at most one entry per input line
my @errors = ();

# number of input lines on which at least one error was counted by checkwarn
my $totalerr = 0;

# set to 1 by checkwarn once an error was detected on the current line;
# reset to 0 by parse_gene_assoc_file at the start of every line
my $errorfound = 0;

# errors with the whole line (missing end of line, blank line, wrong number
# of columns)
my $lineerr = 0;

# total number of lines that passed all checks (written out with -w / -r)
my $totallines = 0;

# total number of lines filtered out to make non-redundant; checkwarn only
# increments it for the column code 100+TAXON, which no caller visible in
# this file passes
my $taxonfiltered = 0;

# defined information about each column in the gene_association files;
# filled by populate_column_array
my @column = ();

# Column positions (0-based index into the TAB separated fields of a line)
use constant DB => 0;
use constant DB_OBJECT_ID => 1;
use constant DB_OBJECT_SYMBOL => 2;
use constant QUALIFIER => 3;
use constant GOID => 4;
use constant REFERENCE => 5;
use constant EVIDENCE => 6;
use constant WITH => 7;
use constant ASPECT => 8;
use constant DB_OBJECT_NAME => 9;
use constant DB_OBJECT_SYNONYM => 10;
use constant DB_OBJECT_TYPE => 11;
use constant TAXON => 12;
use constant DATE => 13;
use constant ASSIGNED_BY => 14;

# Number of TAB delimited columns in file
use constant COLNUM => 15;

# Definition of positions in column array (index into each $column[n] entry).
# Only LABEL and CHECKDB are referenced by the checks visible in this file.
use constant LABEL => 0;
use constant CARDINAL => 1;
use constant SPECIAL => 2;
use constant CHECKDB => 3;
use constant DUBCOLON => 4;

# Define Column Information
&populate_column_array;

# Parse the abbreviations file and find defined abbreviations.
# %abbrev is keyed by the lower-cased abbreviation or synonym.
my %abbrev;
&parse_abbs_file;

### header text built from the gene association config file (-r only)
my $gene_assoc_header = "";

### parse gene associations file, print the report
&parse_gene_assoc_file;

exit;

###############################################################################
################################           ####################################
################################ FUNCTIONS ####################################
################################           ####################################
###############################################################################

###############################################################################
sub check_options
{
###############################################################################
# Validate the command line switches and apply them to the file-level
# settings.
#
# Reads:  $writegood, $writebad, $printhelp, $detail and the $opt_o, $opt_x,
#         $opt_i, $opt_p option values.
# Writes: $obofile, $abbsfile, $inputfile, $projectname.
#
# Dies when both -w and -e are given.  Prints the usage text to STDERR and
# exits when -h is given.  Takes no arguments and returns nothing useful.

    if ($writegood && $writebad)
    {
        die "Unable to have both -w and -e on.  This would print both\n"
            . "good and bad lines to STDOUT, effectively just duplicating the input\n"
            . "file.\nExiting now.\n\n";
    }

    if ($printhelp)
    {

        # The synopsis line lists every switch accepted by getopts,
        # including -e and -x.
        print STDERR <<END;

      Usage:  $0 [-h] [-q] [-d] [-e] [-w] [-r] [-o filename] [-x filename] [-i input file] [-p project]

	  -h displays this message
	  -q quiet mode
	  -i input to present a standard file, gzipped, or compressed file as input.
             STDIN is the default.
          -d switches to a line by line report of errors identified on STDERR
          -e write all "bad lines" to STDOUT
          -w write all "good lines" to STDOUT
          -r write e-mail report and file of all "good lines" to files
          -o alternative path to the gene_ontology_edit.obo file.
          -p force project name
             to turn off taxid check state project name as nocheck
          -x alternative path to GO.xrf_abbs file

	  examples:

	      check a file for any errors, obsolete GOIDs or old IEA annotations

		  % $0 -i gene_association.sgd.gz

	      filter any problems and output the validated lines, including headers

		  % $0 -i gene_association.fb.gz -w > filtered-output

	      check file without the taxid checking on, and write the bad lines to STDOUT

		  % $0 -i gene_association.fb.gz -p nocheck -e > bad-lines

END

        exit;
    }

    $obofile = $opt_o if ($opt_o);

    $abbsfile = $opt_x if ($opt_x);

    if ($opt_i)
    {
        $inputfile = $opt_i;

        # Derive the project name from "gene_association.<project>[.gz|.Z]".
        # Only a trailing compression suffix is removed; both suffixes that
        # parse_gene_assoc_file treats as compressed input are handled.
        $projectname = $inputfile;
        $projectname =~ s/.*gene_association\.//;
        $projectname =~ s/\.(?:gz|Z)$//;
    }

    # an explicit -p always wins over the name derived from the file name
    if ($opt_p)
    {
        $projectname = $opt_p;
        print STDERR "Project Name set to $projectname\n" if ($detail);
    }

    if ($detail)
    {
        print STDERR "Input filename = $inputfile\n";
        print STDERR "Abbrev. filename = $abbsfile\n";
        print STDERR "Project Name = $projectname\n";
    }

    # "-p nocheck" (any case) clears the project name again; this happens
    # after the detail output above, so the name printed is still "nocheck"
    if ( defined($opt_p) && lc($opt_p) eq 'nocheck' )
    {
        $projectname = "";
    }

}

###############################################################################
sub populate_column_array
{
###############################################################################
# Fill the file-level @column table with one entry per gene association
# column.  Every entry is an array reference laid out as
#
#   [ LABEL, CARDINAL, SPECIAL, CHECKDB, DUBCOLON ]
#
#   LABEL     column name used in error messages and in the report
#   CARDINAL  cardinality code (0, 1 or 2)
#             NOTE(review): the meaning of the codes is not stated in this
#             file; presumably 0 = optional, 1 = exactly one, 2 = one or
#             more -- confirm before relying on it
#   SPECIAL   special check included; 0 = FALSE or 1 = TRUE
#   CHECKDB   1 when the database prefix of the IDs in this column is checked
#   DUBCOLON  check for double colon '::' in IDs
#
# Takes no arguments.

    # first element: column position, remaining elements: the column info
    my @layout = (
        [ DB,                'DB',                1, 1, 0, 0 ],
        [ DB_OBJECT_ID,      'DB_Object_ID',      1, 1, 0, 1 ],
        [ DB_OBJECT_SYMBOL,  'DB_Object_Symbol',  1, 1, 0, 0 ],
        [ QUALIFIER,         'Qualifier',         0, 1, 0, 0 ],
        [ GOID,              'GOID',              1, 1, 0, 1 ],
        [ REFERENCE,         'DB:Reference',      2, 1, 1, 1 ],
        [ EVIDENCE,          'Evidence',          1, 1, 0, 0 ],
        [ WITH,              'With',              0, 1, 1, 1 ],
        [ ASPECT,            'Aspect',            1, 1, 0, 0 ],
        [ DB_OBJECT_NAME,    'DB_Object_Name',    0, 1, 0, 0 ],
        [ DB_OBJECT_SYNONYM, 'DB_Object_Synonym', 0, 1, 0, 0 ],
        [ DB_OBJECT_TYPE,    'DB_Object_Type',    1, 1, 0, 0 ],
        [ TAXON,             'Taxon',             2, 1, 0, 1 ],
        [ DATE,              'Date',              1, 1, 0, 0 ],
        [ ASSIGNED_BY,       'Assigned_by',       1, 1, 0, 0 ],
    );

    foreach my $entry (@layout)
    {
        my ($position, @info) = @{$entry};
        $column[$position] = [ @info ];
    }

}

###############################################################################
sub parse_abbs_file
{
###############################################################################
# Read the abbreviations file named by $abbsfile and fill the file-level
# %abbrev hash, keyed by the lower-cased abbreviation or synonym:
#
#   "abbreviation: X"  ->  { TRUE => 1, NAME => X, PRIMARY => 1 }
#   "synonym: Y"       ->  { NAME => <most recent abbreviation>, PRIMARY => 0 }
#
# A synonym never replaces an entry that was already stored as a primary
# abbreviation.  Dies when the file cannot be opened.  With -d the number of
# abbreviations and synonyms read is printed to STDERR.  Takes no arguments.

    my $cntabbs = 0;
    my $cntsyn = 0;
    my $primaryname = "";

    # three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode or a command
    open (my $abbs_fh, '<', $abbsfile) or die "Cannot open file $abbsfile: $!\n";

    while ( my $entry = <$abbs_fh> )
    {
        chomp $entry;

        if ($entry =~ /^abbreviation: (\S+)/)
        {
            # remember the primary name for the synonym lines that follow
            $primaryname = $1;

            my $key = lc($primaryname);
            $abbrev{$key}->{TRUE} = 1;
            $abbrev{$key}->{NAME} = $primaryname;
            $abbrev{$key}->{PRIMARY} = 1;
            $cntabbs++;
        }
        elsif ($entry =~ /^synonym: (\S+)/)
        {
            my $key = lc($1);

            # TRUE is only set for primary abbreviations
            unless ( defined($abbrev{$key}->{TRUE}) )
            {
                $abbrev{$key}->{NAME} = $primaryname;
                $abbrev{$key}->{PRIMARY} = 0;
            }
            $cntsyn++;
        }
    }

    close ($abbs_fh);

    print STDERR "Read $cntabbs abbreviations and $cntsyn synonyms from $abbsfile\n\n" if ($detail);

}

###############################################################################
sub parse_gene_assoc_config_file
{
###############################################################################
# Read "<base_file_name>.conf" and append the header comment lines built
# from it to the file-level $gene_assoc_header.
#
# Argument: $base_file_name -- input file name without the compression
#           suffix, eg: gene_association.sgd
#
# Side effects: sets $configfile; appends to $gene_assoc_header.
# Dies when the config file cannot be opened.
#
# Recognised "key=value" lines are project_name, project_url, contact_email
# and funding_source.  A value of 'unspecified' is ignored; when a key occurs
# more than once the last usable value wins.  All other lines are skipped.
# The header always ends with a "!" line, even when no key was found.

    my ($base_file_name) = @_;

    # config key and the header prefix it produces, in output order
    my @header_fields = (
        [ 'project_name',   '!Project_name: '  ],
        [ 'project_url',    '!URL: '           ],
        [ 'contact_email',  '!Contact Email: ' ],
        [ 'funding_source', '!Funding: '       ],
    );

    my %value_for = ();

    $configfile = "${base_file_name}.conf";

    # three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode or a command
    open (my $meta_fh, '<', $configfile) or die "Cannot open file $configfile for reading: $!\n";

    while ( my $entry = <$meta_fh> )
    {
        chomp $entry;

        next unless ($entry =~ /^(project_name|project_url|contact_email|funding_source)=(.+)$/);

        my ($key, $value) = ($1, $2);
        next if ($value eq 'unspecified');

        $value_for{$key} = $value;
    }

    close ($meta_fh);

    foreach my $field (@header_fields)
    {
        my ($key, $prefix) = @{$field};
        next unless ( exists($value_for{$key}) );
        $gene_assoc_header .= $prefix . $value_for{$key} . "\n";
    }

    $gene_assoc_header .= "!\n";

}

###############################################################################
sub parse_gene_assoc_file
{
###############################################################################
# Read the gene association file named by $inputfile ("-" means STDIN, a
# name ending in .gz or .Z is decompressed through gzip), check the database
# abbreviations used on every line and print the summary report.
#
# Per line: comment lines (leading '!') are passed through with -w; blank
# lines and lines that do not have COLNUM columns are counted as general
# errors and skipped; on all other lines the DB and Assigned_by columns, the
# prefix of every With value and the prefix of every ID in a CHECKDB column
# are checked against %abbrev.  Lines without errors are written to STDOUT
# with -w and to the filtered file with -r; lines with errors are written to
# STDOUT with -e.
#
# With -r the header is read from the ".conf" file, the good lines go to
# "<dir><base file name>.filtered.gz" and, when errors were found, the report
# is also stored by write_report.  <dir> is /tmp/ or, if that directory does
# not exist, the directory part of the input file name.
#
# Unless -q was given the report is printed to STDERR.
# Takes no arguments.  Dies when an input or output file cannot be opened.

    my $base_file_name = "";
    my $dirpath = "/tmp/";
    my $savedirpath = "";
    my $input;
    my $filter;
    my $filterpath = "";

    if ( $inputfile =~ /(.+)\.(?:gz|Z)$/ )
    {
        $base_file_name = $1;

        # List form: the file name is passed to gzip as a single argument,
        # without going through a shell.
        open ($input, '-|', 'gzip', '-dc', '--', $inputfile) or die "Cannot open gzipped input $inputfile for reading: $!\n";
    }
    elsif ( $inputfile eq '-' )
    {
        $input = \*STDIN;
        $base_file_name = $inputfile;
    }
    else
    {
        open ($input, '<', $inputfile) or die "Cannot open input file $inputfile: $!\n";
        $base_file_name = $inputfile;
    }

    if ($writereport)
    {

        parse_gene_assoc_config_file($base_file_name);

        if ( $base_file_name =~ /(^.*\/)(gene_association\..*)/ )
        {
            $savedirpath = $1;
            $base_file_name = $2;
        }

        unless ( -d $dirpath )
        {
            # /tmp directory does not exist so use the directory of the input file
            $dirpath = $savedirpath;
        }

        $filterpath = "${dirpath}${base_file_name}.filtered.gz";

        # The redirection needs a shell, so the path is single-quoted for it
        # (an embedded ' becomes '\'').
        (my $quotedpath = $filterpath) =~ s/'/'\\''/g;

        open ($filter, '|-', "/usr/bin/gzip > '$quotedpath'") or die "Cannot write gzipped output $filterpath: $!\n";
        print {$filter} $gene_assoc_header;
    }

    # Begin input loop
    while ( defined($line = <$input>) )
    {
        $linenum++;
        $errorfound = 0;

        unless ( $line =~ /\n/ )
        {
            checkwarn ("$linenum: No end of line character, the last line of the file is probably missing a return character\n");
            $lineerr++;
        }

        chomp $line;

        # comment lines are not checked, only passed through
        if ($line =~ m/^\!/)
        {
            print "$line\n" if ($writegood);
            next;
        }

        # blank line?
        if ( $line eq "" )
        {
            checkwarn ("$linenum: BLANK line, these should be deleted or start with an \'\!\'\n");
            $lineerr++;
            next;
        }

        # split TAB delimited columns
        my @cols = split(/\t/, $line);

        unless ( scalar @cols == COLNUM )
        {
            checkwarn ("$linenum: Too few or too many columns on this line, found " . scalar @cols . ". There should be " . COLNUM . ". Line skipped.\n");
            # increment error counters
            $lineerr++;
            next;
        }

        # Columns are visited in file order because checkwarn only counts
        # the first error found on a line.
        foreach my $cnum (0 .. $#column)
        {

            # the whole field is a database abbreviation
            if ( ($cnum == DB) || ($cnum == ASSIGNED_BY) )
            {
                _check_db_abbreviation($cols[$cnum], $cnum);
            }

            if ($cnum == WITH)
            {
                # every pipe separated value is checked, whether or not it
                # contains a colon
                if ($cols[WITH] ne "")
                {
                    foreach my $value ( split(/\|/, $cols[WITH]) )
                    {
                        my @parts = split(/:/, $value);
                        _check_db_abbreviation($parts[0], WITH);
                    }
                }
            }
            elsif ($column[$cnum][CHECKDB])
            {
                # CHECK the DB part of every identifier written as DB:ID
                foreach my $tmpfld ( split(/\|/, $cols[$cnum]) )
                {
                    next unless ( $tmpfld =~ /\:/ );
                    my @dbname = split(/\:/, $tmpfld);
                    _check_db_abbreviation($dbname[0], $cnum);
                }
            }
        } # end of loop over each column

        if ($errorfound > 0)
        {
            print "$line\n" if ($writebad);
        }
        else
        {
            print {$filter} "$line\n" if ($writereport);
            print "$line\n" if ($writegood);
            $totallines++;
        }

    }

    # For the gzip pipe a failed close is the only sign that the
    # decompression did not succeed.
    unless ( $inputfile eq '-' )
    {
        close ($input) or print STDERR "Warning: problem reading $inputfile (close failed, status $?): $!\n";
    }

# assume TAB = 8 spaces
    use constant TABWIDTH => 8;

    my $report = "";
    my $has_errors = ( ($totalerr > 0) || ($taxonfiltered > 0) ) ? 1 : 0;

    if ($has_errors)
    {
        $report = "\nNUMBER of ERRORS by COLUMN\n\n";
        $report .= "Column\t\t\tCol#\tNumber of Errors\n";
        for (my $index=0; $index < @errors; $index++)
        {
            next unless ($errors[$index] > 0);

            # pad a copy of the label so that the shared @column table is
            # left untouched
            my $label = $column[$index][LABEL];
            $label .= "\t" if (length($label) < TABWIDTH);
            $label .= "\t" if (length($label) < (TABWIDTH * 2));

            $report .= $label . "\t" . ($index + 1) . "\t" . $errors[$index] . "\n";
        }
        $report .= "General errors\t\t-\t" . $lineerr . "\n" if ($lineerr > 0);
        $report .= "\nTOTAL ERRORS = " . $totalerr . "\n";
        $report .= "TOTAL ROWS with no issues = " . $totallines . "\n";
    }
    else
    {
        $report .= "\nCongratulations, there are no errors.\n\n";
    }

    if ($writereport)
    {
        if ( $has_errors && ($totallines == 0) )
        {
            print {$filter} "! All Gene Associations in this file have been removed by the GO Consortium.\n!\n! The submitted associations most likely stated an NCBI Taxonomy Identifier\n! for each association that is available from another GO member project.\n! The GO Consortium started filtering gene association files in October 2005\n! in an effort to minimize confusion resulting in redundancy between the\n! many projects providing gene association files. At that time the Consortium\n! also started removing associations to obsolete GOIDs, IEA annotations older than\n! one year, and any association that did not meet the syntax defined for this file.\n!\n";
        }

        # Always close the pipe so that a gzip failure is noticed.
        close ($filter) or print STDERR "Warning: problem writing $filterpath (close failed, status $?): $!\n";
    }

    if ($writegood)
    {
        $report .= "Total of $totallines lines (not including header) written to STDOUT.\n\n";
    }

    # the report file is only written when there is something to fix
    if ($writereport && $has_errors)
    {
        write_report($report, $dirpath, $base_file_name);
    }

    print STDERR $report unless ($quietmode);

}

###############################################################################
sub _check_db_abbreviation
{
###############################################################################
# Check one database abbreviation found in column $cnum (0-based) of the
# current line against %abbrev and report a problem through checkwarn when
#   - the abbreviation is not in %abbrev at all,
#   - it is a synonym instead of the primary abbreviation, or
#   - it is the primary abbreviation but not in the standard case.
#
# Arguments: $found -- abbreviation exactly as written in the file
#            $cnum  -- column position, used for the label and the counters

    my ($found, $cnum) = @_;

    $found = "" unless ( defined($found) );

    my $key = lc($found);
    my $where = "$linenum: " . $column[$cnum][LABEL] . " column=" . ($cnum + 1);

    # exists() first, so that looking up an unknown name does not add an
    # empty entry to %abbrev
    if ( !exists($abbrev{$key}) || !defined($abbrev{$key}->{PRIMARY}) )
    {
        checkwarn ($where . " unknown database abbreviation, found \"" . $found . "\"\n", $cnum);
    }
    elsif ( $abbrev{$key}->{PRIMARY} == 0 )
    {
        checkwarn ($where . " database synonym used, found \"" . $found . "\" should be \"" . $abbrev{$key}->{NAME} . "\"\n", $cnum);
    }
    elsif ( $abbrev{$key}->{NAME} ne $found )
    {
        checkwarn ($where . " Abbreviation not standard case usage, found \"" . $found . "\" should be \"" . $abbrev{$key}->{NAME} . "\"\n", $cnum);
    }

}

###############################################################################
sub checkwarn
{
###############################################################################
# Report one problem found on the current input line and update the error
# counters.
#
# Arguments: $errortext -- message, printed to STDERR with -d and to STDOUT
#                          (preceded by an empty line) with -e
#            $colnum    -- 0-based column the problem belongs to; omitted
#                          for problems with the whole line; 100+TAXON marks
#                          a line filtered because of its taxon
#
# Only the first problem of a line is counted ($errorfound is reset by the
# caller for every line), so $totalerr and @errors count lines, not messages.

    my ($errortext, $colnum) = @_;

    print STDERR $errortext if ($detail);
    print STDOUT "\n" . $errortext if ($writebad);

    return if ($errorfound);

    $errorfound = 1;

    if ( !defined($colnum) )
    {
        # Problem with the whole line: it counts towards the total only.
        # The caller keeps the per-kind count in $lineerr; charging it to
        # $errors[0] would inflate the count shown for the DB column.
        $totalerr++;
    }
    elsif ($colnum == 100+TAXON)
    {
        $taxonfiltered++;
    }
    else
    {
        $errors[$colnum]++;
        $totalerr++;
    }
}

###############################################################################
sub write_report
{
###############################################################################
# Write the report letter for the submitter to
# "<report_path><report_file_name>.report".
#
# Arguments: $report           -- summary text to embed in the letter
#            $report_path      -- directory prefix, used as given
#            $report_file_name -- base name of the gene association file
#
# The file name quoted in the letter is $inputfile with everything up to and
# including "/submission/" removed.  Dies when the report file cannot be
# opened or completely written.

    my ($report, $report_path, $report_file_name) = @_;

    $report_file_name = "${report_path}${report_file_name}.report";
    my $gafilename = $inputfile;
    $gafilename =~ s|.*/submission/||;

    my $body1 = "Please review the errors summarized in this report and fix your \n"
        . "gene-associations file as is appropriate. This is an automated message \n"
        . "summarizing results of the GOC filtering for file:\n\n$gafilename\n";

    my $body2 = "To review a report of the errors use the following command from the \n"
        . "gene-associations/submission directory:\n\n  $0 -d -i $gafilename\n\n"
        . "For a complete report including the bad rows use this command:\n\n  $0 -e -i $gafilename\n\n";

    my $body3 = "Your gene_association file with all errors removed is now available\n"
        . "from the gene-associations directory at the geneontology.org web,\n"
        . "FTP and CVS sites. Your email is defined as the address where these\n"
        . "reports should be sent. If this is not correct please have the conf\n"
        . "file updated. If you have any questions or suggestions, please do not\n"
        . "hesitate to contact me.\n\n";

    # three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode or a command
    open (my $report_fh, '>', $report_file_name) or die "Cannot write to ${report_file_name}: $!\n";

    print {$report_fh} "Dear Colleague,\n",
                       "\n",
                       $body1,
                       "\n---$report---\n\n",
                       $body2,
                       $body3,
                       "Mike Cherry\n",
                       "E-mail: cherry\@stanford.edu\n";

    # buffered write errors (eg: disk full) only show up at close
    close ($report_fh) or die "Cannot write to ${report_file_name}: $!\n";

}

###############################################################################
sub _decon
{
###############################################################################
# Decode greek SGML entities and <up>/<down> markup into plain text.
#
# Argument: the string to decode.  Returns the decoded copy.
#
# The rules are applied one after the other, in the order listed, each one
# to every occurrence in the string.

    my ($string) = $_[0];

    # fix for two problems in FB file
    # can be removed when the FB file is fixed
    my @fixups = (
        [ '\&bgr\;',   '&bgr;'        ],
        [ '&g-Tub37C', '&ggr;-Tub37C' ],
    );

    # entity stem (the part between '&' and 'gr;') and the letter name; the
    # upper case entity uses the upper-cased stem and the capitalised name
    my @greek = (
        [ 'a',  'alpha'   ], [ 'b',  'beta'    ], [ 'g',  'gamma'   ],
        [ 'd',  'delta'   ], [ 'e',  'epsilon' ], [ 'z',  'zeta'    ],
        [ 'ee', 'eta'     ], [ 'th', 'theta'   ], [ 'i',  'iota'    ],
        [ 'k',  'kappa'   ], [ 'l',  'lambda'  ], [ 'm',  'mu'      ],
        [ 'n',  'nu'      ], [ 'x',  'xi'      ], [ 'o',  'omicron' ],
        [ 'p',  'pi'      ], [ 'r',  'rho'     ], [ 's',  'sigma'   ],
        [ 't',  'tau'     ], [ 'u',  'upsilon' ], [ 'ph', 'phi'     ],
        [ 'kh', 'chi'     ], [ 'ps', 'psi'     ], [ 'oh', 'omega'   ],
    );

    # subscripts become [[...]], superscripts become [...]
    my @markup = (
        [ '</down>', ']]' ],
        [ '<down>',  '[[' ],
        [ '<up>',    '['  ],
        [ '</up>',   ']'  ],
    );

    my @rules = @fixups;
    foreach my $letter (@greek)
    {
        my ($stem, $name) = @{$letter};
        push @rules, [ '&' . $stem . 'gr;',     $name          ];
        push @rules, [ '&' . uc($stem) . 'gr;', ucfirst($name) ];
    }
    push @rules, @markup;

    foreach my $rule (@rules)
    {
        my ($from, $to) = @{$rule};
        $string =~ s/\Q$from\E/$to/g;
    }

    return ($string);
}

###############################################################################
sub _changerootid
{
###############################################################################
# Replace the GOIDs of the "unknown" terms by the GOID of the root term of
# the same ontology.
#
# Argument: the string to convert.  Returns the converted copy; every
# occurrence in the string is replaced.

    my ($string) = $_[0];

    my %root_for = (
        'GO:0008372' => 'GO:0005575',    # Cellular Component
        'GO:0005554' => 'GO:0003674',    # Molecular Function
        'GO:0000004' => 'GO:0008150',    # Biological Process
    );

    $string =~ s/(GO\:(?:0008372|0005554|0000004))/$root_for{$1}/g;

    return ($string);
}

###############################################################################

__END__

=head1 NAME

I<check-abbr-ga-file.pl> - checks GO gene association file format and data

=head1 SYNOPSIS

=over

=item print usage

  check-abbr-ga-file.pl -h

=back

=over

=item run checks on the specified gene association file

  check-abbr-ga-file.pl -i gene_association.sgd.gz

=back

=over

=item run checks and provide details on all errors on GA file

  check-abbr-ga-file.pl -i gene_association.tair.gz -d |& more

=back

=over

=item filter out lines with errors and output validated lines to STDOUT

  check-abbr-ga-file.pl -i gene_association.fb.gz -w > filtered-output

=back

=head1 DESCRIPTION

Checks the syntax of a gene association file, and removes obsolete
GOIDs, IEA annotations that are older than one year, and annotations
that are provided by one of the MOD projects.

=head1 ARGUMENTS

Arguments can control the input file, the project name, the level of
detail and whether the filtered results are output.

=over

=item -h

print usage message

=item -q

quiet mode, don't print final report to STDERR

=item -i

name of input gene association file.  The file can be compressed or
gzipped.  To specify STDIN use "-i -".

Caveat: The project name is automatically determined from the name of
the gene association file.  When using STDIN for input you must use
the -p option to specify the project name, otherwise all rows will be
filtered out.

=item -d

turn on detailed output.  Each error, if any, is output to STDERR.
The line number within the input file and a description of the type of
error(s) are provided.

Caveat: The details are sent to STDERR.  If you wish to view the errors
with a paging program such as more you will need to use "|&" instead
of the normal pipe symbol "|".  Normally only STDOUT is sent through a
pipe.  Adding the ampersand will send both STDOUT and STDERR
through the pipe.

=item -e

Output each bad line to STDOUT.  The line number within the input file
and a description of the type of error(s) are provided.

=item -w

write validated lines, including header lines, to STDOUT.  You can use
the -d (detailed listing of errors and statistics) with the -w option.
The errors, if any, will be displayed on STDERR and the validated lines
will be sent to STDOUT.  If any error in format or data is identified
for a line it will not be sent to the output.

=item -r

creates two files in the submission directory: .filtered.gz and
.report files; the .filtered.gz file has all the error-free lines from
the gene_association file and the .report file has a summary of the
errors found in the MOD submitted gene_association file.  When writing
out the .filtered.gz file, the script uses the contents of the .conf
file for that particular gene association file to create its header
section.  For more information about the format of the .conf file, please
see the INPUT section below.

=item -o

full name to OBO file.  The default is
$gocvsbase/ontology/gene_ontology_edit.obo, as if you are running this script
from within the gene-associations directory in your GO CVS sandbox.
You can use any file in OBO format, the obsolete GOIDs are identified
by the "is_obsolete: true" line.

=item -p

used to define the project name.  A specific project name is required
for the species filtering.  This option takes precedence over the
automatic project name determination that uses the input file name.
The -p option is required if using STDIN to provide the gene
association file.  The project names and their taxids are listed below.
Each of these taxids is only allowed within the defined project specific
file.  All other taxids are allowed without restriction.

NOTE: To turn off the taxid checking use the -p option and specify the
name as "nocheck".

  PROJECT NAME          TAXID
  ====================  ============
  cgd                   taxon:5476
  dictyBase             taxon:5782, taxon:44689, taxon:352472, taxon:366501
  fb                    taxon:7227
  GeneDB_Lmajor         taxon:5664
  GeneDB_Pfalciparum    taxon:5833
  GeneDB_Spombe         taxon:4896
  GeneDB_Tbrucei        taxon:185431
  GeneDB_tsetse         taxon:37546
  goa_chicken           taxon:9031, taxon:208524, taxon:208525, taxon:208526, taxon:400035
  goa_cow               taxon:9913, taxon:297284, taxon:30523
  goa_human             taxon:9606
  gramene_oryza         taxon:4528, taxon:4529, taxon:4530, taxon:4532, taxon:4533,
                        taxon:4534, taxon:4535, taxon:4536, taxon:4537,
                        taxon:4538, taxon:4539, taxon:29689, taxon:29690,
                        taxon:39946, taxon:39947, taxon:40148, taxon:40149,
                        taxon:52545, taxon:63629, taxon:65489, taxon:65491,
                        taxon:77588, taxon:83307, taxon:83308, taxon:83309,
                        taxon:110450, taxon:110451, taxon:127571, taxon:364099, taxon:364100
  mgi                   taxon:10090
  rgd                   taxon:10116
  sgd                   taxon:4932, taxon:41870, taxon:285006, taxon:307796
  tair                  taxon:3702
  jcvi_Aphagocytophilum	taxon:212042
  jcvi_Banthracis       taxon:198094
  jcvi_Cburnetii        taxon:227377
  jcvi_Chydrogenoformans	taxon:246194
  jcvi_Cjejuni          taxon:195099
  jcvi_Cperfringens	taxon:195103
  jcvi_Cpsychrerythraea	taxon:167879
  jcvi_Dethenogenes     taxon:243164
  jcvi_Echaffeensis	taxon:205920
  jcvi_Gsulfurreducens  taxon:243231
  jcvi_Hneptunium	taxon:228405
  jcvi_Lmonocytogenes   taxon:265669
  jcvi_Mcapsulatus      taxon:243233
  jcvi_Nsennetsu	taxon:222891
  jcvi_Pfluorescens	taxon:220664
  jcvi_Psyringae        taxon:223283
  jcvi_Psyringae_phaseolicola	taxon:264730
  jcvi_Soneidensis      taxon:211586
  jcvi_Spomeroyi        taxon:246200
  jcvi_Tbrucei_chr2     taxon:5691
  jcvi_Vcholerae        taxon:686
  wb                    taxon:6239
  zfin                  taxon:7955

=item -x

full name to GO abbreviation file.  The default is $gocvsbase/doc/GO.xrf_abbs,
as if you are running this script from within the gene-associations
directory in your GO CVS sandbox.  You can use any file in a similar
form as the GO.xrf_abbs file in the GO CVS.

=back

=head1 INPUT

The specification of the gene_association format is defined at:
http://www.geneontology.org/GO.annotation.html#file

=over

=item GA file column definitions

 0: DB, database contributing the file (always "SGD" for this file).
 1: DB_Object_ID, SGDID (SGD's unique identifier for genes and
    features).
 2: DB_Object_Symbol, see below
 3: Qualifier (optional), one or more of 'NOT', 'contributes_to',
    'colocalizes_with' as qualifier(s) for a GO annotation, when needed,
    multiples separated by pipe (|)
 4: GO ID, unique numeric identifier for the GO term
 5: DB:Reference(|DB:Reference), the reference associated with the GO
    annotation
 6: Evidence, the evidence code for the GO annotation
 7: With (or) From (optional), any With or From qualifier for the GO
    annotation
 8: Aspect, which ontology the GO term belongs (Function, Process or
    Component)
 9: DB_Object_Name(|Name) (optional), a name for the gene product in
    words, e.g. 'acid phosphatase'
10: DB_Object_Synonym(|Synonym) (optional), see below
11: DB_Object_Type, type of object annotated, e.g. gene, protein, etc.
12: taxon(|taxon), taxonomic identifier of species encoding gene
    product
13: Date, date GO annotation was defined in the format YYYYMMDD
14: Assigned_by, source of the annotation (always "SGD" for this file)

=item Config file format

 project_name=Saccharomyces Genome Database (SGD)
 contact_email=yeast-curator@yeastgenome.org
 project_url=http://www.yeastgenome.org/
 funding_source=NHGRI at US NIH, grant number 5-P41-HG001315
 email_report=yeast-curator@yeastgenome.org,cherry@stanford.edu

=back

=head1 OUTPUT

The default output using the -w option is a validated gene association
file on STDOUT. See the INPUT section for details on this format.
When using the -r option, two output files will be created: .filtered.gz
and .report files.  See the INPUT section for config file format.

=head1 REASON ROWS WOULD BE REJECTED

The following is a brief summary of the common errors this script will find.

   1. Not the correct number of columns.
   2. Any leading or trailing spaces on any field.
   3. Cardinality does not match format specification.
   4. DB abbreviation is not one of the standard set used by the GO Consortium.
   5. Qualifier column can only include NOT, contributes_to or colocalizes_with
   6. One of the three aspects (ontologies) is stated for each line.
   7. Evidence code column needs to be present and one of the standard set.
   8. DB Object Type is one of the defined set.
   9. Stated Taxid is allowed for the particular project file.
  10. GOID is not obsolete.
  11. Date is in proper format.
  12. IEA annotations are less than one year old.

=head1 FUTURE ENHANCEMENTS

 Check GOID and Aspect column for consistency.

=cut

