#!/usr/bin/perl -w
#	Monthly Report.pl
#	A full analysis of terms and their movements within GO.
#
#	It is recommended that this script be run after Checks.pl,
#	which ensures that the ontologies don't contain errors.
#
#	What this script reports:
#	- new terms
#	- terms obsoleted
#	- terms that are now synonyms of other terms
#	- term name changes
#	- new definitions
#	- term movements between GO Slim terms
#	- general stats
#
#	Files needed:
#	- copies of the current ontology files, stored in go/ontology/
#	- copies of an old set of ontology files, stored in go/ontology/old/
#	- copy of the defs file, stored in go/doc/GO.defs
#	- an old defs file, stored in go/doc/old/GO.defs
#	- the Sourceforge IDs file, stored in go/SFids
#	- a GO Slim file, stored in go/goslim/goslim.txt
#
#	Script by Amelia Ireland
#	Comments, bugs, complete-lack-of-functionality complaints to
#	aji@ebi.ac.uk
#

#$mode = 0;
#print "Please select a running mode for this script.\nMonthly report generator: [V]erbose, [C]oncise;\n[O]ntology checker\n[R]edundant relationship checker\n";
#while ($mode == 0)
#{	$response = <STDIN>;
#	if ($response =~ /[vV]/)
#	{	print "Monthly report generator, verbose mode.\n";
#		$mode = 1;
#	}
#	elsif ($response =~ /[cC]/)
#	{	print "Monthly report generator, concise mode.\n";
#		$mode = 1;
#	}
#	else
#	{	print "That is not a valid option. Please try again.\n";
#	}
#}

#	importing the goslim terms
printf ("Importing GO Slim terms... ");
$gsFile = "go/goslim/goslim.txt";
open(gsFILE, '<'.$gsFile) or die "Cannot find file $gsFile.\n";
while (<gsFILE>)
{ $Line = $_;
	if (/\A .*?[<%](.*?) ; (GO:\d{7})/)
	{	$gsTerms{$2} = 99;
	}
	elsif (/version: \$Revision: 1.3 $/)
	{	$rev = $1;
	}
	elsif (/GO_slim_name: (.*)/)
	{	$goSlim = $1;
	}
}
print "done.\n";

$OutFile = "Report.txt";
open(OUTFILE, '>'.$OutFile) or die "The file $OutFile could not be created.\n";
$date = localtime();
printf OUTFILE ("GO Monthly Release Notes\n========================\nGenerated on $date\n");
printf OUTFILE ("GO Slim $goSlim, revision $rev used.\n\n");
printf OUTFILE ("Key:\n
|----1----||----2-----||3||----4----||-----------5-----------|
GO:0009941  GO:0009536  D  SF:575119  chloroplast envelope

1. GO ID number
2. GO-slim term that new term was added under
3. indicates the existence of a definition for the term
4. the sourceforge request ID the term was added in response to
5. term name\n\n");

%cur = (1 => 'SGD', 1501 => 'MGD', 3001 => 'none', 3674 => 'OS', 7639 => 'none', 8001 => 'FB', 9501 => 'TAIR', 11001 => 'SP', 12501 => 'JCVI', 14001 => 'none', 15000 => 'FB', 20001 => 'PSU', 21501 => 'none', 30001 => 'MAH', 35001 => 'FB', 40001 => 'WB', 42001 => 'JL', 45001 => 'AI', 48001 => 'CB', 50001 => 'none'); 

#	importing the sourceforge IDs
printf ("Importing Sourceforge IDs... ");
$sfFile = "go/SFids";
open(sfFILE, '<'.$sfFile) or die "Cannot find file $sfFile.\n";
while (<sfFILE>)
{	$Line = $_;
	if (/GO:/)
	{	chomp($Line);
		$Line =~ s/ //g;
		while ($Line ne "")
		{	if ($Line =~ /\A(GO:\d{7})/)
			{	$source{$1} = $sf;
				$Line =~ s/$1//g;
			}
		}
	}
	elsif (/(\d{6})/)
	{	$sf = $1;
	}
}
print "done.\n";

$lastNest = 0;
$gs = "";
%ontology = (0 => 'component', 1 => 'function', 2 => 'process');
$ont = 0;

@Ontologies = ("go/ontology/old/component.ontology", "go/ontology/old/function.ontology", "go/ontology/old/process.ontology");
foreach $InFile (@Ontologies)
{	open(INFILE, '<'.$InFile) or die "Cannot find file $InFile.\n";
	@path = ();
	printf ("Loading old %s ontology... ", $ontology{$ont});
	while (<INFILE>)
	{	$Line = $_;
		$obs = 0;
		if (/\A(.*?)[<%](.*?) ; (GO:\d{7})/)
		{	$thisNest = length($1);
			$goid = $3;
			$term = $2;
			$term =~ s/\\//g;
			
			#	work out the path for the term
			$a = $lastNest - $thisNest;
			$a++;
			while ($a != 0)
			{	pop(@path);
				$a--;
			}
			push(@path, $goid);
			$lastNest = $thisNest;

			foreach $i (@path)
			{	#	sets obsolete flag for term
				if ($i =~ /GO:00083(69|70|71)/)
				{	$obs = 1;
				}
				#	finds the GO Slim parent of the term
				if (exists($gsTerms{$i}))
				{	$gs = $i;
				}
			}
			
			if (exists($terms{$goid}))
			{	#	add the GO Slim parent, checking it doesn't already exist
				$flag = 0;
				foreach $i (@{$oldGS{$goid}})
				{	if ($i eq $gs)
					{	$flag = 0;
						last;
					}
				}
				if ($flag == 0)
				{	push(@{$oldGS{$goid}}, $gs);
				}
				
			}
			else
			{	#	create a record of the term - name, ontology, obsolete, GS parent
				$terms{$goid} = 1;
				$ont{$goid} = $ont;
				$name{$goid} = $term;
				$obs{$goid} = $obs;
				$oldGS{$goid} = [ $gs ];
			}
		}
	}
	$ont++;
	print "done.\n"
}

print "Finished loading old ontology files.\n";

$ont = 0;
@Ontologies = ("go/ontology/component.ontology", "go/ontology/function.ontology", "go/ontology/process.ontology");
foreach $InFile (@Ontologies)
{	open(INFILE, '<'.$InFile) or die "Cannot find file $InFile.\n";
	@path = ();
	$parent = "";
	printf ("Loading current %s ontology...\n", $ontology{$ont});
	while (<INFILE>)
	{	$Line = $_;
		$obs = 0;
		if (/\A(.*?)[<%](.*?) ; (GO:\d{7})/)
		{	$thisNest = length($1);
			$goid = $3;
			$term = $2;

			if ($term =~ /\\/)
			{	$term =~ s/\\//g;
			}

			$a = $lastNest - $thisNest;
			$a++;
			while ($a != 0)
			{	pop(@path);
				$a--;
			}
			push(@path, $goid);
			$lastNest = $thisNest;

			foreach $i (@path)
			{	if ($i =~ /GO:00083(69|70|71)/)
				{	$obs = 2;
				}
				if (exists($gsTerms{$i}))
				{	$gs = $i;
				}
				#	find the direct parent
				if ((scalar(@path) >= 2) && ($i ne $goid))
				{	$parent = $i;
				}
			}
			
			#the meaty chunks bit
			if (exists($terms{$goid}))
			{	if ($terms{$goid} !~ /2/)
				#do these checks once - updates status of old terms
				{	$terms{$goid} .= 2;
					$obs{$goid} .= $obs;
					$newGS{$goid} = [ $gs ];
					$newPar{$goid} = $parent;

					#check for term name changes 
					if ($name{$goid} ne $term)
					{	#printf "Term name change: %s --> $term\n", $name{$goid};
						push(@{$newName{$ont{$goid}}}, $goid);
						$oldName{$goid} = $name{$goid};
						$name{$goid} = $term;
					}

					#add the synonyms
					if (/\A.*?[<%].*? ; GO:\d{7}, (GO:\d{7}.*)/)
					{	$j = $1;
						$j =~ s/, //g;
						if ($j =~ /(.*?)[<%;]/)
						{	$j = $1;	}
						while (length($j) > 1)
						{	if ($j =~ /\A(GO:\d{7})/)
							{	$syn{$1} = $goid;
								$j =~ s/\AGO:\d{7}//g;
							}
						}
					}
				}

				#do these checks every time
				else
				{	$flag = 0;
					foreach $i (@{$newGS{$goid}})
					{	if ($i eq $gs)
						{	$flag = 1;
							last;
						}
					}
					if ($flag == 0)
					{	push(@{$newGS{$goid}}, $gs);
					}

					if ($newPar{$goid} !~ /$parent/)
					{	$newPar{$goid} .= $parent;
					}
				}
			}

			else #a completely brand new term!
			{	$terms{$goid} = 2;
				$ont{$goid} = $ont;
				$name{$goid} = $term;
				$obs{$goid} = $obs;
				$newGS{$goid} = [ $gs ];
				$newPar{$goid} = $parent;
				#synonyms
				if (/\A.*?[<%].*? ; GO:\d{7}, (GO:\d{7}.*)/)
				{	$j = $1;
					$j =~ s/, //g;
					if ($j =~ /(.*?)[<%;]/)
					{	$j = $1;	}
					while (length($j) > 1)
					{	if ($j =~ /\A(GO:\d{7})/)
						{	$syn{$1} = $goid;
							$j =~ s/\AGO:\d{7}//g;
						}
					}
				}
			}
		}
	}
	$ont++;
	print "done.\n"
}

#removing obsolete GO SLIM terms
foreach $i (sort keys %gsTerms)
{	if (!exists($terms{$i}))
	{	print "Error: GO SLIM term $i no longer exists.\n";
		delete $gsTerms{$i};
	}
	elsif ($obs{$i} =~ /[12]/)
	{	print "Error: GO SLIM term $i has been made obsolete.\n";
		delete $gsTerms{$i};
	}
}

#new terms, synonyms and lost terms
foreach $i (sort keys %terms)
{	#	new terms
	if ($terms{$i} == 2)
	{	#print "New term\n";
		if ($obs{$i} !~ /2/)
		{	push(@{$new{$ont{$i}}}, $i);
		}
		delete($newPar{$i});
	}
	#	terms lost without a trace
	elsif ($terms{$i} == 1)
	{	if (exists($syn{$i}))
		{	#print "$i has been merged into $syn{$i}\n";
			push(@{$newSyn{$ont{$i}}}, $i);
		}
		else
		{	print "Term lost: $i, $name{$i}\n";
		}
		delete($terms{$i});
	}
	else
	#terms in both ontologies
	{	if ($obs{$i} !~ /[12]/)
		{	#	compare the old goslim terms with those in the current ontology and see if there have been any movements.
			@oldGS = @{$oldGS{$i}};
			@newGS = @{$newGS{$i}};

			my %tracker = ();
			$tracker{$_} .= 1 for @oldGS;
			$tracker{$_} .= 2 for @newGS;
			for (sort keys %tracker)
			{	if ($tracker{$_} !~ /1/)
				{	push(@{$movedTo{$_}}, $i);
				}
				elsif ($tracker{$_} !~ /2/)
				{	push(@{$movedFrom{$_}}, $i);
				}
			}
		}
	}
}

#obsoleted terms
foreach $i (sort keys %obs)
{	if ($obs{$i} =~ /[12]/)
	{	delete($newPar{$i});
	}

	#	terms that are newly obsolete
	if (($obs{$i} =~ /2/) && ($obs{$i} !~ /1/))
	{	push(@{$newObs{$ont{$i}}}, $i);
	}
	
}

$odFile = "go/doc/old/GO.defs";
$ndFile = "go/doc/GO.defs";

open(oldDefFILE, '<'.$odFile) or die "Cannot find file $odFile.\n";
open(newDefFILE, '<'.$ndFile) or die "Cannot find file $ndFile.\n";

while (<oldDefFILE>)
{	$Line = $_;
	if (/goid: (GO:\d{7})/)
	{	$goid = $1;
	}
	if (/definition: (.*)/)
	{	$defstr{$goid} = $1;
		$def{$goid} = 1;
	}
}


$new = 0;
while (<newDefFILE>)
{	$Line = $_;
	if (/goid: (GO:\d{7})/)
	{	$goid = $1;
	}
	if (/definition: (.*)/)
	{	if (exists($def{$goid}))
		{	$def{$goid} .= 2;
			if ($defstr{$goid} ne $1)
			{	push(@defChange, $goid);
			}
			delete($defstr{$goid});
		}
		else
		{	$def{$goid} = 2;
		}
	}
	if ((/comment: (.*)/) && (exists($obs{$goid})))
	{	$comm = $1;
		if ($comm =~ /This term was made obsolete because (.{3})(.*)/)
		{	if ($1 eq "it ")
			{	$comment{$goid} = $2;
			}
			else
			{	$comment{$goid} = $1.$2;
			}
		}
		else
		{	$comment{$goid} = $comm;
		}
	}
}

close(oldDefFILE);
close(newDefFILE);

foreach $i (sort keys %def)
{	if ($def{$i} !~ /2/)
	{	if (exists($syn{$i}))
		{	#term has been made a synonym
			delete ($def{$i});
		}
		else
		{	print "Def lost: $i, $name{$i}: $defstr{$i}\n";
			delete ($def{$i});
		}
	}
	elsif ($def{$i} !~ /1/)
	{	if ($obs{$i} !~ /2/)
		{	push(@{$newDef{$ont{$i}}}, $i);
		}
	}
}

foreach $j (keys %ontology)
{	if (exists($new{$j}))
	{	$a = scalar(@{$new{$j}});
		print OUTFILE "New terms in $ontology{$j} ontology ($a new terms)\n";
		foreach $i (@{$new{$j}})
		{	print OUTFILE "$i\t";
	
			if ($i =~ /GO:(\d{7})/)
			{	$id = $1;
			}
	
			$temp = 0;
			foreach $k (sort {$a <=> $b} keys %cur)
			{	if ($id > $k)
				{	$temp = $k;
				}
			}
			print OUTFILE "$cur{$temp}\t" unless ($cur{$temp} eq 'none');
			if ($cur{$temp} eq 'none')
			{	print "Error: $i\n";
				print OUTFILE "\t";
			}
	
			$first = shift(@{$newGS{$i}});
			print OUTFILE "$first\t";
			if (exists($def{$i}))
			{	print OUTFILE "D\t";
			}
			else
			{	print OUTFILE "\t";
			}

			if (exists($source{$i}))
			{	print OUTFILE "$source{$i}\t";
			}
			else
			{	print OUTFILE "\t";
			}

			print OUTFILE "$name{$i}\n";
			while (scalar(@{$newGS{$i}}) != 0)
			{	$first = shift(@{$newGS{$i}});
				print OUTFILE "\t\t\t$first\n"
			}
		}
	}
	if (exists($newObs{$j}))
	{	print OUTFILE "\n\nNew obsoletions in $ontology{$j} ontology\n";
		foreach $i (@{$newObs{$j}})
		{	print OUTFILE "$i, $name{$i}: ";
			if (exists($comment{$i}))
			{	print OUTFILE "$comment{$i}\n";
			}
			else
			{	print OUTFILE "no reason given\n";
			}
		}
	}
	
	if (exists($newSyn{$j}))
	{	print OUTFILE "\n\nNew term merges in $ontology{$j} ontology\n";
		foreach $i (@{$newSyn{$j}})
		{	print OUTFILE "$i has been merged into $syn{$i}, $name{$syn{$i}}\n";
		}
	}
	
	if (exists($newName{$j}))
	{	print OUTFILE "\n\nTerm name changes in $ontology{$j} ontology\n";
		foreach $i (sort(@{$newName{$j}}))
		{	print OUTFILE "$i: $oldName{$i} --> $name{$i}\n";
		}
	}
	
	if (exists($newDef{$j}))
	{	$a = scalar(@{$newDef{$j}});
		print OUTFILE "\n\nNew definitions for $ontology{$j} ontology terms ($a new definitions)\n";
		foreach $i (@{$newDef{$j}})
		{	print OUTFILE "$i, $name{$i}\n";
		}
	}
	print OUTFILE "\n";
}

foreach $i (sort keys %newPar)
{	$string = $newPar{$i};
	while ($string ne "")
	{	if ($string =~ /\A(GO:\d{7})/)
		{	push(@{$parents{$i}}, $1);
			$children{$1} .= $i;
			$string =~ s/$1//g;
		}
	}
}

for ($i = 0; $i < 3; $i++)
{	print OUTFILE "\nTerm movements in $ontology{$i} ontology\n";
	foreach $j (sort keys %gsTerms)
	{	if ($ont{$j} == $i)
		{	if (exists($movedTo{$j}))
			{	print OUTFILE "Terms added under $j, $name{$j}:\n";
				%testList = ();
				%testGroup = ();
				%status = ();
				foreach $k (@{$movedTo{$j}})
				{	#	print "$k";
					$status{$k} = 0;
					if (!exists($children{$k}))
					{	$testList{$k} = 0;
						#	print " has no children.\n";
					}
					else
					{	$testGroup{$k} = 0;
						#	print " has children.\n";
					}
				}
				#	print "\n";

				while (scalar(%testList))
				{	#	print "LOOP\n";
					#	foreach $l (keys %testList)
					#	{	print "TL: $l\n";
					#	}
					#	foreach $l (keys %testGroup)
					#	{	print "TG: $l\n";
					#	}
					
					foreach $l (keys %testList)
					{	#	print "$l has parents ";
						@list = @{$parents{$l}};
						foreach $m (sort @list)
						{	#	print "$m ";
							if (exists($status{$m}))
							{	#	print "\n$l has parent $m in the test group!\n";
								$testList{$m} = 0;
								$status{$l} = 1;
								$status{$m} = 2;
								delete $testGroup{$m};
							}
						}
						#	print "\n";
						delete $testList{$l};
					}
				}					

				foreach $x (sort keys %status)
				{	if ($status{$x} == 2)
					{	print OUTFILE "$x and children\n";
					}
					if ($status{$x} == 0)
					{	print OUTFILE "$x\n";
					}
				}
				print OUTFILE "\n";
			}
			
			if (exists($movedFrom{$j}))
			{	print OUTFILE "Terms removed from $j, $name{$j}:\n";
				%testList = ();
				%testGroup = ();
				%status = ();
				foreach $k (@{$movedFrom{$j}})
				{	$status{$k} = 0;
					if (!exists($children{$k}))
					{	$testList{$k} = 0;
					}
					else
					{	$testGroup{$k} = 0;
					}
				}

				while (scalar(%testList))
				{	foreach $l (keys %testList)
					{	@list = @{$parents{$l}};
						foreach $m (sort @list)
						{	if (exists($status{$m}))
							{	$testList{$m} = 0;
								$status{$l} = 1;
								$status{$m} = 2;
								delete $testGroup{$m};
							}
						}
						delete $testList{$l};
					}
				}					

				foreach $x (sort keys %status)
				{	if ($status{$x} == 2)
					{	print OUTFILE "$x and children\n";
					}
					if ($status{$x} == 0)
					{	print OUTFILE "$x\n";
					}
				}
				print OUTFILE "\n";
			}
		}
	}
}


$a = scalar(@defChange);
if ($a != 0)
{	print "$a definitions changed.\n";
}

#statistics
$comp = 0; $func = 0; $proc = 0; $tot = 0;
$cd = 0; $fd = 0; $pd = 0; $td = 0;

foreach $i (keys %terms)
{	if ($ont{$i} == 0)
	{	$comp++; $tot++;
		if (exists($def{$i}))
		{	$cd++; $td++;
		}
	}
	
	elsif ($ont{$i} == 1)
	{	$func++; $tot++;
		if (exists($def{$i}))
		{	$fd++; $td++;
		}
	}
	
	elsif ($ont{$i} == 2)
	{	$proc++; $tot++;
		if (exists($def{$i}))
		{	$pd++; $td++;
		}
	}
}

printf OUTFILE ("\nStatistics:\nComponent: $comp terms, %.1f%% defined ($cd terms defined)\n", $cd/$comp*100);
printf OUTFILE ("Function: $func terms, %.1f%% defined ($fd terms defined)\n", $fd/$func*100);
printf OUTFILE ("Process: $proc terms, %.1f%% defined ($pd terms defined)\n", $pd/$proc*100);
printf OUTFILE ("Total: $tot terms, %.1f%% defined ($td terms defined)\n", $td/$tot*100); 

exit(0);
