#!/usr/bin/perl -w
#
# J. Clark, February 2005	
# 
# This program classifies each annotation by species (taxon), source database
# and evidence class (IEA or non-IEA).
# The output file contains one line per annotation, describing its class.
# 
# Before running the programme please put all the gene association files in the same directory as the perl script
# and run these two commands:
#
# gunzip gene_association.*
# cat gene_association.* > annotationfile.txt
#
#
# After running the program you need to give this command in the terminal:
#
# cat output.txt | sort | uniq -c > sorted.txt
#
#
#
#The result will look like this:
#
#   3 IEA annotations from CGEN for species with taxon:100.
#   4 IEA annotations from CGEN for species with taxon:100054.
#  16 IEA annotations from CGEN for species with taxon:10007.
#   3 IEA annotations from CGEN for species with taxon:100115.
#
#


use strict;


#for storing lines in later
my @columnsarray;
my $line;
my $annot; 
my $taxon; 
my @lines;
my $i;
my $allannots;
my $output = "output.txt";
my $annotationfile = "annotationfile.txt";

    



#	create a new annots file for writing
print "Creating output file...\n";
open(OUTPUT, ">$output") || die ("The file $output could not be created.\n");

# Open the annotation file for reading
print "Opening annotation file '$annotationfile'...\n";

open (ANNOTATIONFILE, "$annotationfile") || die "Can't open files.\n";  

#do what's in the braces to every line in the file one line at a time.

print "processing annotation file...\n";

while(<ANNOTATIONFILE>){	

    #take the line breaks off.
    chomp;
    next if /^!/;          # ignore comments by erasing them
    next if /^(\s)*$/;  # skip blank lines
 
 	#substitute any non-IEA evidence code with the string 'non-IEA".
  	$_ =~ s/(\tIC\t)|(\tIDA\t)|(\tIEP\t)|(\tIGI\t)|(\tIMP\t)|(\tIPI\t)|(\tISS\t)|
  					 (\tRCA\t)|(\tNAS\t)|(\tND\t)|(\tTAS\t)|(\tNR\t)/\tnon-IEA\t/;
    
	@columnsarray = split( '\t'); #put the columns into an array of 15 scalars per line.    
	   
	 $line =  "$columnsarray[6] annotations from $columnsarray[0] for species with $columnsarray[12].\n"; 
	 
	print OUTPUT "$line";
	
			





#end of 'while' thing.
}	


print "finished processing annotation file...\n";
#we are done with this file now.
close ANNOTATIONFILE;	














