#!/usr/bin/perl -w
#	OntologyChecks.pl
#
#	WARNING: script not finished - work in progress!
#
#	Checks done:
#	Defs file
#	- conflicts
#	- comments format
#	- illegal characters
#	- first letter not capitalized
#	- last letter not full stop
#	- EC number formatting
#	- double colons
#	- obsolete formatting
#
#	Ontology files
#	- conflicts
#	- terms with same name, diff GOids
#	- secondary ID format
#	- <new term>, <new synonym>, XX:<new dbxref>
#	- terms moved to diff ontology
#	- terms in 1+ ontology
#	- redundant rlnships (inc signs)
#	- repeated is-a/part-of rlnships
#	- bad dbx formatting
#	- ISBN/PMID/GO as general dbxref
#	- obs term rlnships
#	- obs terms elsewhere in the ontology
#	- terms or definitions lost without a trace
#
#	Files needed:
#	- copies of the current ontology files, stored in go/ontology/
#	- copies of an old set of ontology files so the script can check for
#	lost terms, etc., stored in go/ontology/old/
#	- copy of the defs file, stored in go/doc/GO.defs
#	- an old defs file, stored in go/doc/old/GO.defs
#
#	OntologyChecks.pl creates a data file (go/data.txt) with a brief summary
#	of the old ontology contents. It compares this to the current state of the
#	ontologies and reports on the changes. It creates a new version of the
#	ontologies with some of the above mistakes corrected and the others
#	documented in the log file.
#
#	New additions:
#	Added Synonymizer.pl functionality (finds the synonyms and prompts to update
#	the synonyms file)
#
#	Checks for secondary IDs:
#	- secIDs don't match extant terms
#	- secIDs aren't shared by more than one term / move between terms
#
#	Checks OBSOLETE not in defs of non-obsolete terms
#
#	Script by Amelia Ireland
#	Comments, bugs, complete-lack-of-functionality complaints to
#	aji@ebi.ac.uk
#
#	TO DO:
#	- code to isolate completely loopy comments
#	- check syns/refs always followed by a reference ID (is this necessary?)
#	- update redundancy code to remove redundancies (may not be worthwhile...)
#	- check whether or not the synonyms and obsoletes need to be updated in CVS
#	- add the Dictionary to this script
#


#if (-e $dataFile)
#{	
#	files required by the program

#	Input files. The three ontology lists in this section are index-aligned:
#	position 0 is component, 1 is function, 2 is process.
$dataFile = "go/data.txt";
$DefFile = "go/doc/GO.defs";
$SynFile = "go/doc/synonyms/Synonyms.txt";
@Ontologies =
(	"go/ontology/component.ontology",
	"go/ontology/function.ontology",
	"go/ontology/process.ontology",
);

#	Output files written by the program.
$LogFile = "OntologyChecks Log";
$DefOutFile = "go/doc/newGO.defs";
$SynOutFile = "go/doc/synonyms/newSynonyms.txt";
@NewOntologies =
(	"go/ontology/newcomponent.ontology",
	"go/ontology/newfunction.ontology",
	"go/ontology/newprocess.ontology",
);

#	Names the current ontology files are renamed to when a corrected copy
#	takes their place.
@OldOntologies =
(	"go/ontology/oldcomponent.ontology",
	"go/ontology/oldfunction.ontology",
	"go/ontology/oldprocess.ontology",
);

#	create a new Synonyms mapping file
open(SYNOUTFILE, '>', $SynOutFile) or die "The file $SynOutFile could not be created.\n";
#	SYNOUTFILE is deliberately left open: header lines are copied into it
#	below and the synonym rows are appended to it near the end of the script.

#	Read the existing synonyms file. Lines starting with "!" are header lines
#	and are copied to the new file unchanged. Any other line is expected to be
#		GO:nnnnnnn <tab> (ignored field) <tab> relationship type <tab> synonym
#	and fills in:
#		%synTerm	goid => 1 (the term has synonyms in the old file)
#		%syn		goid => list of its synonyms
#		%type		goid.synonym => relationship type
#		%newSynz	goid => synonyms whose type is not one of = < > ~ !=
open(SYNFILE, '<', $SynFile) or die "The file $SynFile could not be found.\n";
while (<SYNFILE>)
{	$Line = $_;
	if ($Line =~ /^!/)
	{	print SYNOUTFILE "$Line";
		next;
	}

	$Line =~ s/"//g;
	if ($Line =~ /(GO:\d{7})\t.*?\t(.*?)\t(.*)/)
	{	$goid = $1;
		$string = $3;
		$sign = $2;
		$synTerm{$goid} = 1;
		#	keep only the synonym itself if more tab-separated fields follow it
		$string =~ s/(.*?)\t.*/$1/;
		push(@{$syn{$goid}}, $string);
		$type{$goid.$string} = $sign;
		#	a synonym without a recognised relationship type is queued as if it
		#	were new, so that a type is assigned to it later on
		if ($sign !~ /^([=\<\>\~]|!=)$/)
		{	push(@{$newSynz{$goid}}, $string);
		}
	}
}
#	the old file has been read completely; release the handle
close(SYNFILE);

#	Lookup tables keyed by ontology number (0, 1, 2 = position in @Ontologies).
%ontology = (0 => 'cellular component', 1 => 'molecular function', 2 => 'biological process');
#	numeric part of the GO id that is looked for in a term's ancestor path to
#	decide whether the term sits under the "obsolete" node of its ontology
%obsId = (0 => '0008370', 1 => '0008369', 2 => '0008371');
#	short ontology codes used as a prefix in log messages
%Ont = (0 => 'CO', 1 => 'FO', 2 => 'PO');

open(LOGFILE, '>', $LogFile) or die "The file $LogFile could not be created.\n";

#	Load the summary written by the previous run, if there is one. Each line
#	is tab-separated: ontology number, definition flag, obsolete flag, GO id,
#	name. A "name" that is itself a GO id marks the line as a secondary ID and
#	is stored in %secID instead of %name.
if (-e $dataFile)
{	#	an existing but unreadable file must not be mistaken for "loaded"
	open(DATA, '<', $dataFile) or die "The file $dataFile could not be opened: $!\n";
	while (<DATA>)
	{	$Line = $_;
		#		 ont		def	 	obs		goid		  name
		if (/([012])\t([01])\t([012])\t(GO:\d{7})\t(.*?)\t.*/)
		{	my ($oldOnt, $oldDef, $oldObs) = ($1, $2, $3);
			$goid = $4;
			$name = $5;
			#	%old starts at 0 for every known id; the later passes add to it
			#	or delete the entry as the term is found again
			$old{$goid} = 0;
			$ont{$goid} = $oldOnt;
			$def{$goid} = $oldDef;
			$obs{$goid} = $oldObs;
			if ($name =~ /(GO:\d{7})/)
			{	$secID{$goid} = $1; }
			else
			{	$name{$goid} = $name; }
		}
		else
		{	print "Error in data file!\n";
		}
	}
	close(DATA);
	print "Loaded old data file.\n";
}

#	Main pass over the current ontology files. Every line of each file in
#	@Ontologies (index $ont) is checked, corrected where the script knows how,
#	and written to the matching file in @NewOntologies. $error collects the
#	index of every ontology in which a correctable problem was flagged; it is
#	used afterwards to decide whether the rewritten file is kept.
$error = "";
$ont = 0;
foreach $InFile (@Ontologies)
{	open(INFILE, '<', $InFile) or die "The file $InFile could not be found.\n";
	$OutFile = $NewOntologies[$ont];
	open(OUTFILE, '>', $OutFile) or die "The file $OutFile could not be created.\n";
	$LineNo = 1;
	$lastGoid = "";
	$lastNest = 0;
	%term = ();
	@path = ();
	print "Loading current $ontology{$ont} ontology...\n";
	while (<INFILE>)
	{	$Line = $_;
		#	$core is the line cut off just before its second relationship sign
		#	(< or %), i.e. without the extra parents listed after the term
		$core = $_;
		if ($core =~ /(.*?[<%].*? )[<%].*/)
		{	$core = $1;
		}
		$obs = 0;

#	Ontology file conflicts
		if ((/<<</) || (/===/) || (/>>>/))
		{	print "Error - conflict found in the ontology.\n";
			print LOGFILE "$Ont{$ont} $LineNo: conflict marker in ontology.\n";
		}

		#	A term line: leading whitespace (nesting depth), relationship sign,
		#	term name, " ; " and the GO id.
		if ($Line =~ /^(\s{1,})([<%])(.*?) ; (GO:\d{7})/)
		{	$thisNest = length($1);
			$goid = $4;
			$term = $3;
			$sign = $2.$goid;
			$s = $2;
			$path = "";
			$term =~ s/\\//g;

			$new{$goid} = 1;

#	Ontology file repeated is-a/part-of rlnships
			if ($goid eq $lastGoid)
			{	print "Repeated line: $LineNo, $goid.\n";
				print LOGFILE "$Ont{$ont} $LineNo: repeated line. $ontology{$ont} ontology\n";
			}
			$lastGoid = $goid;

			#	Drop the entries of @path that are not ancestors of this line.
			#	The count is negative when the indentation jumps by more than
			#	one level; testing "> 0" keeps that case from looping forever.
			my $levelsUp = $lastNest - $thisNest + 1;
			while ($levelsUp > 0)
			{	pop(@path);
				$levelsUp--;
			}

			foreach $i (@path)
			{	if ($i =~ /$obsId{$ont}/)
				{	$obs = 1;
#	Ontology file incorrect obsolete term rlnships
					if ($s eq "<")
					{	print "Obsolete relationship needs to be changed: $goid.\n";
						print LOGFILE "$Ont{$ont} $LineNo: obsolete relationship needs to be changed: $goid\n";
						$error .= $ont;
						#	rewrite the leading "<" as "%"
						if ($Line =~ /\A(\s{1,})<(.*)/)
						{	$Line = $1."%".$2."\n";
						}
					}
				}
				$path .= $i;
			}
			$path .= $s;
			push(@path, $sign);
			$lastNest = $thisNest;

#	Ontology file redundant rlnships (inc signs)
			if (exists($paths{$goid}))
			{	@tempList = @{$paths{$goid}};
				foreach $i (@tempList)
				{	$t = chop($i); # but don't save these changes!
					if (($i =~ /$path/) || ($path =~ /$i/))
					{	if ($s eq $t)
						{	$red{$goid} = 1;
#these could be deleted but only if it was the line just checked that was redundant
						}
						else
						{	$red{$goid} = 2;
						}
					}
				}
				$path .= $s;
				push(@{$paths{$goid}}, $path);
			}
			else
			{	$path .= $s;
				$paths{$goid} = [ $path ];
			}

#--------START IMPORTANT BIT
			#the meaty chunks bit
			if (exists($name{$goid}))	#it's not a new term
			{
#	Ontology file terms moved to diff ontology
#	Ontology file terms in 1+ ontology
#
#	How are we going to store this info? Want to know which ontologies it's in.
#	Also know which ontology it should be in - it should be in the original ontology
#	and removed from the one it has been added to. Wouldn't be able to tell if term
#	is new.
				if ($ont{$goid} != $ont)
				{	print "Error - $goid in more than one ontology.\n";
				}

				#check for obsoletion errors
				if ($obs{$goid} != $obs)
				{	if ($obs{$goid} == 0)
					#	the term has been obsoleted
					{	$obs{$goid} = 1;
					}
#	Ontology file obs terms elsewhere in the ontology
					elsif ($obs{$goid} == 1)
					#	the term was/is obsolete but is active! Uh-oh!
					{	print "Error! Obsolete term in active ontology! $goid\n";
						print LOGFILE "Obsolete term in active ontology, $goid\n";
						$obs{$goid} = 2;
						#	need to change this when the error is sorted out.
					}
				}

				if (!exists($done{$goid}))
				{	$done{$goid} = 1;
					#	+10 marks "term seen in the current ontology files"
					$old{$goid} += 10;

					#check for term name changes
					if ($name{$goid} ne $term)
					{	#printf "Term name change: %s --> $term\n", $name{$goid};
						$name{$goid} = $term;
					}

					#	check that the term is not also a secondary ID
					if (exists $secID{$goid})
					{	print "Existing term $goid is also a secondary ID of $secID{$goid}.\n";
					}

#	Ontology file terms with same name, diff GOids
					#check for identical term names
					if ($obs != 1)
					{	if ((exists($nameCheck{$term})) && ($nameCheck{$term} ne $goid))
						{	print "Name repeated: $term, $nameCheck{$term} and $goid\n";
							if (exists($nameTwice{$term}))
							{	if ($nameTwice{$term} !~ /$goid/)
								{	$nameTwice{$term} .= ", ".$goid;
								}
							}
							else
							{	$nameTwice{$term} = $nameCheck{$term}.", ".$goid;
							}
						}
						else
						{	$nameCheck{$term} = $goid; }
					}
				}
			}

			else #a completely brand new term!
			{	$ont{$goid} = $ont;
				$name{$goid} = $term;
				$obs{$goid} = $obs;
				$def{$goid} = 0;
				if (($ont == 1) && ($obs == 0))
				{	if ($term !~ /activity/)
					{	print "Possible error with new function term: $term\n";
					}
				}

#	Ontology file terms with same name, diff GOids
				if ($obs != 1)
				{	if ((exists($nameCheck{$term})) && ($nameCheck{$term} ne $goid))
					{	print "Name repeated: $term, $nameCheck{$term} and $goid\n";
						#	start the list with the id that already owned the
						#	name, so the report shows both ids
						if (exists($nameTwice{$term}))
						{	$nameTwice{$term} .= ", ".$goid;
						}
						else
						{	$nameTwice{$term} = $nameCheck{$term}.", ".$goid;
						}
						if ($ont{$nameCheck{$term}} == $ont{$goid})
						{	print "Same ontology.\n";
						}
					}
					else
					{	$nameCheck{$term} = $goid; }
				}
				$done{$goid} = 1;
			}
#--------END IMPORTANT BIT
		}

#	find the synonyms for the synonyms script
		if (/synonym/ and !exists $synDone{$goid} and $obs == 0)
		{	$ref = $Line;
			$ref =~ s/\\//g;
			while ($ref =~ /synonym:(.*?)($| ; .*| [<%])/g)
			{	#	copy the captures at once: every match below resets $1/$2
				my ($synText, $rest) = ($1, $2);
				$synTerm{$goid} .= 2;
				#	synonym formatting error - secondary ID as a synonym
				if ($synText =~ /(GO:\d{7})/)
				{	my $secId = $1;
					$error .= $ont;
					print "$Ont{$ont} $LineNo: Formatting error - secondary ID\n";
					print LOGFILE "$Ont{$ont} $LineNo: synonym formatting error\n";
					push(@synonymFormat, $secId);
					#reformat: drop the synonym and list the id after the term's id
					$Line =~ s/ ; synonym:\Q$secId\E//g;
					$Line =~ s/\Q$goid\E/$goid, $secId/g;
				}
				else
				{	$synonym = $synText;
					push(@{$syns{$goid}}, $synonym);
					while ($synonym =~ /(\S+)\s*/g)
					{	$w = $1;
						if ($w =~ /[A-Za-z]/)
						{	$word{$w}++;
						}
					}
				}
				#	carry on with whatever followed this synonym on the line
				$ref = $rest;
			}
			$synDone{$goid} = 1;
		}

#	Error: GOID as a general dbxref
		if (/(GO:\d{7}) ; (GO:\d{7})/)
		{	#	saved because the substitution's own match clears $1 and $2
			my ($firstId, $secondId) = ($1, $2);
			$error .= $ont;
			print "$Ont{$ont} $LineNo: Formatting error - secondary ID\n";
			print LOGFILE "$Ont{$ont} $LineNo: GOID as general dbxref\n";
			push(@synonymFormat, $firstId);
			#offer to reformat
			$Line =~ s/\Q$firstId\E ; \Q$secondId\E/$firstId, $secondId/g;
		}

#	start SECONDARY ID CODE
		#add the secondary IDs
		if ($core =~ /GO:\d{7}, (GO:\d{7}.*)/)
		{	$j = $1;
			while ($j =~ /(GO:\d{7})/g)
			{	#	check the secondary ID against what was in the data file
				if (exists $secID{$1} and $secID{$1} ne $goid)
				{	print "$1 is a secondary ID for $goid (current) and $secID{$1} (old).\n";
				}

				#	check that the secondary ID does not exist already in the ontologies as a secID for something else
				if (exists $secondary{$1} and $secondary{$1} ne $goid)
				{	print "$1 is a secondary ID for $goid and $secondary{$1} in current files.\n";
				}
				$secondary{$1} = $goid;
			}
		}
#	end SECONDARY ID CODE

#	Ontology file <new term>, <new synonym>, XX:<new dbxref>
		if (/\>/)
		{	print "Error! \> in ontology!\n$Line";
			print LOGFILE "$Ont{$ont} $LineNo: \> in ontology: check for new synonyms, terms or dbxrefs\n";
			#	removes any blank references straight away
			if (/:\\<new/)
			{	$error .= $ont;
				$Line =~ s/ ; XX:\\<new dbxref>//g;
				$Line =~ s/ ; synonym:\\<new synonym>//g;
			}

			if (/\\<new term> ; (GO:\d{7})/)
			{	#	we're in trouble!
				print "No name: $1\n$Line";
				#	either enter a name or remove all refs to it?
			}

			print "$Line";
		}

#	Ontology file ISBN/PMID/GO as general dbxref
#	check for incorrect dbxrefs
		if (/(PMID:.*|ISBN:.*|GO:\D{1,}.*)/)
		{	$ref = $1;
			$ref =~ s/ [<;%].*//g;
			$error .= $ont;
			#	remember the text exactly as it stands on the line: that, not
			#	the reformatted version, is what has to be removed from $Line
			my $rawRef = $ref;
			$ref = refFormatter($ref);
			print "$Ont{$ont} $LineNo: incorrect general dbxref: $goid, $ref\n$Line";
			print LOGFILE "$Ont{$ont} $LineNo: incorrect general dbxref: $goid, $ref\n";

			if ((exists($def{$goid})) && ($def{$goid} == 1))
			{	$Line =~ s/ ; \Q$rawRef\E//g;
				$extraRef{$goid} = $ref;
			}
			else
			{	print "Incorrect dbxref, cannot be moved: $goid, $ref\n$Line";
				$extraRef{$goid} = $ref;
			}
		}

#	EC / TC ref formatting
		if (/; ([ET]C:.*)/)
		{	$ref = $1;
			$ref =~ s/ [<;%].*//g;
			#	compare against a private copy so the test still works if the
			#	formatter touches the shared $ref
			my $oldRef = $ref;
			$newRef = refFormatter($ref);
			if ($newRef ne $oldRef)
			{	print "Reformatted $oldRef to $newRef\n";
				print LOGFILE "Reformatted $oldRef to $newRef\n";
				$Line =~ s/\Q$oldRef\E/$newRef/g;
				$error .= $ont;
			}
		}
		print OUTFILE "$Line";
		$LineNo++;
	}
	#	finished with this ontology: flush the rewritten copy to disk before
	#	anything renames or deletes it
	close(INFILE);
	close(OUTFILE);
	$ont++;
	print "done.\n";
}

#	Decide, per ontology, what happens to the rewritten copy. $error holds the
#	index digit of every ontology in which something was flagged.
$i = 0;
while ($i < 3)
{	if ($error =~ /$i/)
	{	#	back up the current file, then move the corrected copy into place
		rename($Ontologies[$i], $OldOntologies[$i]);
		rename($NewOntologies[$i], $Ontologies[$i]);
	}
	else
	{	#	nothing was flagged for this ontology: the copy is not needed
		unlink($NewOntologies[$i]);
	}
	$i++;
}


#	check the secondary IDs. Go through the old secIDs and check whether any have been lost or reinstated
foreach $i (keys %secID)
{	#	%secID maps a secondary ID from the old data file to the term it
	#	belonged to. GO ids are written "GO:" plus seven digits, so the colon
	#	has to be part of the pattern for this check to ever run.
	if ($secID{$i} =~ /GO:\d{7}/)
	{	#	the id is no longer listed as a secondary ID in the current files
		if (!exists $secondary{$i})
		{	print "secondary ID $i (was secID to $secID{$i}) ";
			if (exists $new{$i})
			{	print "has been reinstated as a term\n";
			}
			else
			{	print "lost\n";
			}
		}
	}
	#	the old mapping has been dealt with; drop it
	delete $secID{$i};
}

#	check the new secondary IDs don't have any erroneous data and add them to the list of terms that appear in the new files
foreach my $secId (keys %secondary)
{	#	%secondary maps each secondary ID found in the current files to the
	#	term it is listed under
	my $primary = $secondary{$secId};

	print "Error! Name for sec ID $secId exists!\n"
		if (exists $name{$secId} and exists $new{$secId});

	#	the "name" of a secondary ID is the GO id of its primary term; the
	#	other attributes are copied from that term
	$name{$secId} = $primary;
	($ont{$secId}, $obs{$secId}, $def{$secId}) = ($ont{$primary}, $obs{$primary}, $def{$primary});
	$new{$secId} = 1;
}

#	Pass over the definitions file. Every line of $DefFile is checked, tidied
#	where possible and written to $DefOutFile. $error is set to 1 as soon as
#	the output differs from the input, so that the rewritten file is kept.
$error = 0;
#	accepted openings for a comment that does not suggest replacement terms
@comm = ('This term was made obsolete', 'This term was split from', 'See also', 'Note that');
$obs = 'OBSOLETE (was not defined before being made obsolete).';
$obsDef = 'OBSOLETE. ';
open(DEFFILE, '<', $DefFile) or die "The file $DefFile could not be found.\n";
open(DEFOUTFILE, '>', $DefOutFile) or die "The file $DefOutFile could not be created.\n";
print "Opening GO.defs file.\n";
while (<DEFFILE>)
{	$Line = $_;
	chomp($Line);

	#	Look for strange characters.
	#	NOTE(review): $goid in these messages is the id parsed most recently,
	#	so for a "term:" line it still names the previous entry.
	if ((/<<<<<<</) || (/=======/) || (/>>>>>>>/))
	{	print "Error - conflict found in the ontology.\n";
	}

	if (/[\<\>\{\}\&\|\?\"\`\t\\]/)
	{	unless (/[\d| ]->[\d| ]/ or /http:/ or /-\|-/)
		{	print LOGFILE "possible illegal character error $goid: \"$Line\"\n\n";
		}
	}

	if (/[\x80-\xFF]/)
	{	print LOGFILE "High-ASCII illegal character error $goid: \"$Line\"\n\n";
	}

	if (/ - /)
	{	print LOGFILE "possible illegal symbol string $goid: \"$Line\"\n\n";
	}

	if (/term: (.*)/)
	{	$termName = $1;
		#	collect the words of the name for the word list
		while ($termName =~ /(\S+)\s*/g)
		{	$w = $1;
			if ($w =~ /[A-Za-z]/)
			{	$word{$w}++;
			}
		}
	}
	elsif (/goid: (.*)/)
	{	$refFlag = 0;
		$goid = $1;
		$def{$goid} = 1;
		$obsFlag = $obs{$goid};

		if ($termName ne $name{$goid})
		{	print "Error : term name inconsistency! $goid, $name{$goid}\n";
		}

		#	a dbxref found in the ontology file is waiting to be added here
		if (exists($extraRef{$goid}))
		{	$refFlag = 1;
		}

		if (exists($old{$goid}))
		{	#	below 10 means the term was not seen in the ontology files
			if ($old{$goid} < 10)
			{	print "Error - term missing but definition still present: $goid\n";
			}
			else
			{	delete $old{$goid};
			}
		}
	}
	elsif (/definition: (.*)/)
	{	$def = $1;
		$oldDef = $def;
		$def =~ s/\\n/ /g;
		$def =~ s/\.([A-Za-z])/\. $1/g;
		$def =~ s/  / /g;
		if ($def =~ /^[a-z]/)
		{	print "First letter not capitalized: $goid.\n";
			$def = ucfirst($def);
			print LOGFILE "First letter capitalized: $goid.\n";
			$error = 1;
		}

		#	count opening and closing brackets of any kind
		if ($def =~ /[\(\)\[\]\{\}]/)
		{	$temp = $def;
			$temp =~ s/[\[\{]/\(/g;
			$temp =~ s/[\]\}]/\)/g;
			$left = 0; $right = 0;
			while ($temp =~ /\(/g)
			{	$left++;
			}
			while ($temp =~ /\)/g)
			{	$right++;
			}
			if ($left != $right)
			{	print "Possible error with brackets: $goid.\n";
				print LOGFILE "Possible error with brackets: $goid.\n";
			}
		}

		if ($def !~ /\.$/)
		{	print "Definition did not end with a full stop: $goid:\n";
			$error = 1;
			while (substr($def, -1, 1) eq " ")
			{	chop($def);
			}
			my $lastChar = substr($def, -1, 1);
			if ($lastChar =~ /[,;:]/)
			{	#	replace trailing punctuation with the full stop
				chop($def);
				$def .= ".";
			}
			elsif ($lastChar =~ /\)/)
			{	$def .= ".";
			}
			elsif ($lastChar !~ /\./)
			{	$def .= ".";
			}
			print LOGFILE "Definition given full stop at end: $goid.\n";
		}

		while ($def =~ /(\S+)\s*/g)
		{	$w = $1;
			if ($w =~ /[A-Za-z]/)
			{	$word{$w}++;
			}
		}

		#	if the term is obsolete, check that the def contains the word obsolete somewhere
		#	need to alter this check so it looks at previous data to see if the def has 'obsolete' in it.
		if (exists($obs{$goid}) and $obs{$goid} != 0)
		{	if (($def ne $obs) && (substr($def, 0, 10) ne $obsDef))
			{	print "Error in obsolete formatting: $goid.\n$Line\n";
				print LOGFILE "Error in obsolete formatting: $goid.\n$Line\n";
				if ($def =~ /defined/)
				{	$def = $obs;
				}
				else
				{	#FIX!
					print "Replace? Choose an option (n does nothing):\n1: OBSOLETE (undefined)  2: add OBSOLETE to existing def  3: Suggest a new def.\n";
					$response = <STDIN>;
					#	end of input or a bare Enter must not wipe the
					#	definition: treat both as "n"
					$response = "n" unless defined($response);
					chomp($response);
					$response = "n" if $response eq "";
					if ($response eq "1")
					{	$def = $obs;
					}
					elsif ($response eq "2")
					{	$def = $obsDef.$def;
					}
					elsif ($response ne "n")
					{	#	anything else is taken as the new definition text
						$def = $response;
					}
				}
			}
		}
		if ($def =~ m/obsolete/i and $obs{$goid} == 0)
		{	print "Error - non obsolete term $goid has obsolete in definition!\n";
		}
		if ($def ne $oldDef)
		{	$error = 1;
		}
		$Line = "definition: $def";
	}
	elsif (/definition_reference: (.*)/)
	{	$ref = $1;
		$origRef = $1;
		$ref =~ s/::/:/g;
		while (substr($ref, -1, 1) eq " ")
		{	$ref = substr($ref, 0, length($ref)-1);
		}

		#EC ref formatting: hand over the cleaned-up reference, not the raw
		#text of the line, so the tidying above is not thrown away
		if (/([ET]C:.*|ISBN.*|PMID.*)/)
		{	$ref = refFormatter($ref);
		}

		if ($refFlag == 1)
		{	#check if the ref is the same
			if ($ref eq $extraRef{$goid})
			{	$refFlag = 0;
			}
		}
		if ($origRef ne $ref)
		{	print "Reference reformatted: $goid\n";
			print LOGFILE "Reference reformatted: $goid\n";
			$error = 1;
		}
		$Line = "definition_reference: $ref";
	}
	elsif (/comment: (.*)/)
	{	$obsFlag = 0;
		#	the pending dbxref goes in before the comment line
		if ($refFlag == 1)
		{	print DEFOUTFILE "definition_reference: $extraRef{$goid}\n";
			print "Adding dbxref $extraRef{$goid} to $goid.\n";
			$error = 1;
			$refFlag = 0;
		}

		$comment = $1;
		$oldComment = $comment;
		$comment =~ s/\\n/ /g;
		$comment =~ s/  / /g;
		$comment =~ s/\.'/'\./g;
		if ($comment =~ /^[a-z]/)
		{	print "First letter of comment not capitalized: $goid.\n";
			print LOGFILE "First letter of comment not capitalized: $goid.\n";
			$comment = ucfirst($comment);
			$error = 1;
		}

		if ($comment !~ /\.$/)
		{	print "Comment did not end with a full stop: $goid:\n";
			print LOGFILE "Comment did not end with a full stop: $goid.\n";
			$error = 1;
			while (substr($comment, -1, 1) eq " ")
			{	chop($comment);
			}
			if (substr($comment, -1, 1) =~ /[,;:]/)
			{	chop($comment);
				$comment .= ".";
			}
			elsif (substr($comment, -1, 1) =~ /\)/)
			{	$comment .= ".";
			}
			elsif (substr($comment, -1, 1) !~ /\./)
			{	$comment .= ".";
			}
		}

#To update annotations, use the [molecular function|biological process|cellular component] term '[text] ; GO:[id]'.

#To update annotations, consider the [molecular function|biological process|cellular component] terms '[text1] ; GO:[id1]', '[text2] ; GO:[id2]' and '[text3] ; GO:[id3]' (and its child[ren]).

		#	code to spot errors in the suggesting alternate terms bit...
		if ($comment =~ /To update annotations, (use|consider) the (.*?\.)/i)
		{	$alt = $2;
			#	the match ignores case, so the verb is compared in lower case
			$use = (lc($1) eq 'use') ? 1 : 0;

			#	every suggested term should be wrapped in one pair of quotes
			my $idCount = 0;
			$idCount++ while ($alt =~ /(GO:\S*)/g);
			my $quoteCount = 0;
			$quoteCount++ while ($alt =~ /'/g);
			if ($quoteCount != $idCount * 2)
			{	print "$goid : Error with number of 's - $quoteCount, $idCount\n";
				print "$alt\n";
				print "Ignore (I) or add?\n";
				$response = <STDIN>;
				#	keep asking on an empty line, but stop at end of input
				while (defined($response))
				{	chomp($response);
					last if $response ne "";
					print "No response. Please try again.\n";
					$response = <STDIN>;
				}
				#	the prompt shows "(I)", so accept either case; end of
				#	input counts as "ignore"
				if (defined($response) && lc($response) ne "i")
				{	$alt = $response;
				}
			}

			while ($alt =~ /\'(.*?); (GO:\S*?)\'/g)
			{	$refName = $1;
				$refGoid = $2;
				if ($refGoid =~ /(GO:\d{7})/)
				{	$refGoid = $1;
					if (exists $secondary{$refGoid})
					{	print "Error: $refGoid is a synonym for $secondary{$refGoid}\n";
						print LOGFILE "Replaced secondary GOID $refGoid with $secondary{$refGoid}.\n";
						$comment =~ s/$refGoid/$secondary{$refGoid}/;
						$refGoid = $secondary{$refGoid};
					}

					if (exists $obs{$refGoid} and $obs{$refGoid} != 0)
					{	print "Error, $goid: suggested term $refGoid is obsolete!\n";
					}

					#	$refName keeps the blank that precedes the ";"
					if (exists $name{$refGoid} and $refName ne $name{$refGoid}." ")
					{	print "Error with $refGoid term name in comment for $goid\n$name{$refGoid}, $refName.\n";
						print LOGFILE "Comment changed for $goid ($refGoid had incorrect term name)\n";
						#	quoted: term names may contain brackets, "+" etc.
						$comment =~ s/\Q$refName\E; \Q$refGoid\E/$name{$refGoid} ; $refGoid/g;
					}

					push(@{$referenced{$refGoid}}, $goid);
					if ($use == 1)
					{	push(@{$replacement{$goid}}, $refGoid);
					}
					else
					{	push(@{$suggested{$goid}}, $refGoid);
					}

					#	a single suggestion must read exactly
					#	"<ontology> term '<name> ; <id>'."
					if (($idCount == 1) && ($alt !~ /its child/))
					{	if ($alt ne "$ontology{$ont{$refGoid}} term '$name{$refGoid} ; $refGoid'.")
						{	print "error with ontology string\n$comment\n";
							$comment =~ s/\Q$alt\E/$ontology{$ont{$refGoid}} term '$name{$refGoid} ; $refGoid'./;
							print LOGFILE "Comment for $goid incorrect; replaced $alt with $ontology{$ont{$refGoid}} term '$name{$refGoid} ; $refGoid'.\n";
						}
					}
				}
				else
				{	print "$refGoid is not a proper GO term - beware!\n";
				}
			}
		}
		else
		{	$flag = 0;
			foreach $i (@comm)
			{	if ($comment =~ /\A$i/)
				{	$flag = 1;
				}
			}
			if ($flag == 0)
			{	print "Comment formatting error: $goid\n$Line\n";
				print LOGFILE "Comment formatting error: $goid\n";
			}
		}
		if ($oldComment ne $comment)
		{	$error = 1;
		}
		$Line = "comment: ".$comment;
		while ($comment =~ /(\S+)\s*/g)
		{	$w = $1;
			if ($w =~ /[A-Za-z]/)
			{	$word{$w}++;
			}
		}
	}
	elsif ($Line eq "")
	{	#	end of an entry that had no comment line
		if ($refFlag == 1)
		{	print DEFOUTFILE "definition_reference: $extraRef{$goid}\n";
			print "Adding dbxref $extraRef{$goid} to $goid.\n";
			print LOGFILE "Added dbxref $extraRef{$goid} to $goid.\n";
			#	the output now differs from the input, so it has to be kept;
			#	clearing the flag stops a second blank line adding it again
			$error = 1;
			$refFlag = 0;
		}
		if ($obsFlag == 1)
		{	print "Missing comment for obsolete term $goid.\n";
			print LOGFILE "Missing comment for obsolete term $goid.\n";
			$obsFlag = 0;
		}
	}
	print DEFOUTFILE "$Line\n";
}
#	DEFOUTFILE stays open: more entries may still be appended to it
close(DEFFILE);

#	Collect every term whose obsolete flag is set.
foreach $i (sort keys %obs)
{	if ($obs{$i} != 0)
	{	push(@Obs, $i);
	}
}

#	Offer to add a stock definition for each obsolete term that has none.
foreach $i (@Obs)
{	if ($def{$i} == 0)
	{	print "No def for obsolete term $i. Add one?\n";
		$response = <STDIN>;
		chomp($response);
		if ($response eq "y")
		{	#	the entry belongs in the new definitions file; OUTFILE is the
			#	handle of an ontology output file
			print DEFOUTFILE "term: $name{$i}\ngoid: $i\ndefinition: $obs\ndefinition_reference: GO:curators\n\n";
			$error = 1;
		}
		else
		{	print "No definition added.\n";
		}
	}
}

$count = 0;

#	Finish writing the new definitions file before it is deleted or moved, so
#	that buffered output is on disk and the file is not renamed while open.
close(DEFOUTFILE);

if ($error == 0)
{	#	nothing was changed: the copy is not needed
	unlink($DefOutFile);
}
else
{	#	back up the current file, then move the corrected copy into place
	rename($DefFile, "go/doc/oldGO.defs")
		or warn "Could not rename $DefFile to go/doc/oldGO.defs: $!\n";
	rename($DefOutFile, $DefFile)
		or warn "Could not rename $DefOutFile to $DefFile: $!\n";
}

###	Reprinting the DATA file	###

#	One line per term or secondary ID seen in this run, tab-separated:
#	ontology number, definition flag, obsolete flag, GO id, name, then the
#	ids of the terms whose comments refer to it (written without separators).
#	NOTE(review): an undefined ont/def/obs value is written as an empty field,
#	which the pattern used to read this file back will not accept.
open(DATA, '>', $dataFile) or die "The file $dataFile could not be created.\n";
foreach $i (sort keys %new)
{	print DATA "$ont{$i}\t$def{$i}\t$obs{$i}\t$i\t$name{$i}\t";
	if ($obs{$i} != 0) { $count++; }
	if (exists($referenced{$i}))
	{	foreach $j (sort @{$referenced{$i}})
		{	print DATA "$j";
		}
	}
	print DATA "\n";
}
#	write errors on a buffered handle only show up when it is closed
close(DATA) or warn "The file $dataFile could not be written completely: $!\n";


print "Total obsoletes: $count\n";

#	Two mapping files for obsolete terms: "exact" lists the terms a comment
#	says to use instead, "inexact" the terms a comment says to consider.
$ObsDirFile = "go/doc/obsoletes-exact";
open(OBSDIRFILE, '>', $ObsDirFile) or die "The file $ObsDirFile could not be created.\n";

$ObsAllFile = "go/doc/obsoletes-inexact";
open(OBSALLFILE, '>', $ObsAllFile) or die "The file $ObsAllFile could not be created.\n";

#	The keyword lines are single-quoted so that every "$" is written as is.
#	Inside double quotes the closing "$" followed by "\n" is read as the
#	variable $\ and the letter "n", which loses both the "$" and the newline.
print OBSDIRFILE '!version: $Revision: 1.6 $', "\n", '!date: $Date: 2003/05/30 14:08:57 $', "\n", "!Obsolete terms and direct annotation substitutes\n!\n!Obsolete\tAlternative\n";
print OBSALLFILE '!version: $Revision: 1.6 $', "\n", '!date: $Date: 2003/05/30 14:08:57 $', "\n", "!Obsolete terms and possible annotation substitutes\n!\n!Obsolete\tAlternative\n";

#	$count is the number of obsolete terms with at least one entry, not the
#	number of lines written
$count = 0;
foreach $i (sort keys %replacement)
{	foreach $j (sort @{$replacement{$i}})
	{	print OBSDIRFILE "$i\t$j\n";
	}
	$count++;
}
close(OBSDIRFILE) or warn "The file $ObsDirFile could not be written completely: $!\n";
print "Direct replacements: $count\n";

$count = 0;
foreach $i (sort keys %suggested)
{	foreach $j (sort @{$suggested{$i}})
	{	print OBSALLFILE "$i\t$j\n";
	}
	$count++;
}
close(OBSALLFILE) or warn "The file $ObsAllFile could not be written completely: $!\n";
print "Indirect replacements: $count\n";

#	Whatever is still in %old was in the old data file and was not cleared by
#	the defs pass. Secondary IDs are dropped from the list; a value of
#	exactly 10 is skipped silently.
foreach my $id (sort keys %old)
{	if (exists($secondary{$id}))
	{	delete $old{$id};
		next;
	}

	my $state = $old{$id};
	next if ($state == 10);

	print "Def lost: $id\n" if ($state == 11);
	print "Term lost: $id\n" if ($state < 10);
	print "Script error: $id\n" if ($state > 11);
}

#	only relationships repeated with the same sign are reported
foreach my $id (sort keys %red)
{	next unless ($red{$id} == 1);
	print "Redundant relationship: $id, could be removed\n";
}

foreach my $id (sort keys %extraRef)
{	print "Ref in ontology file: $id, $extraRef{$id}\n";
}

foreach my $id (sort keys %refReplaced)
{	print "Ref replaced: $id, $refReplaced{$id}\n";
}

foreach my $termText (sort keys %nameTwice)
{	print "Duplicated name: $termText, $nameTwice{$termText}\n";
}

#	$synTerm{id} is a string of flags: "1" when the term had synonyms in the
#	old synonyms file, followed by one "2" per synonym found in the ontology
#	files. "12" therefore means the term has synonyms in both places.
foreach my $id (sort keys %synTerm)
{	my $state = $synTerm{$id};

	if ($state !~ /12/)
	{	if ($state !~ /2/)
		{	#	only in the old file: all of its synonyms have gone
			@{$synzLost{$id}} = @{$syn{$id}};
		}
		else #new synonyms
		{	@{$newSynz{$id}} = @{$syns{$id}};
		}
		next;
	}

	#	in both: tag every synonym with where it was seen and compare
	@oldSyn = @{$syn{$id}};
	@newSyn = @{$syns{$id}};
	my %tracker = ();
	foreach my $text (@oldSyn)
	{	$tracker{$text} .= 1;
	}
	foreach my $text (@newSyn)
	{	$tracker{$text} .= 2;
	}
	foreach my $text (sort keys %tracker)
	{	my $seen = $tracker{$text};
		if ($seen !~ /1/)
		{	push(@{$newSynz{$id}}, $text);
		}
		elsif ($seen !~ /2/)
		{	push(@{$synzLost{$id}}, $text);
		}
	}
}

#	Ask whether relationship types should be entered for the new synonyms.
#	Without a "y" (or when there is no input at all) they are given type "?".
print "Update synonyms now?\n";
$response = <STDIN>;
if (defined($response) && $response =~ /y/)
{	$yes = 1;
}
else
{	$yes = 0;
}

print "\nSynonyms gained...\n";
foreach $i (sort keys %newSynz)
{	foreach $j (@{$newSynz{$i}})
	{	if ($yes == 1)
		{	print "$i: $name{$i} ? $j\n";
			print "Please suggest a relationship type:\n=\t!=\t<\t>\t~\n";
			$response = <STDIN>;
			while (defined($response))
			{	chomp($response);
				#	surrounding blanks are dropped and the whole answer has to
				#	be one of the types, so the stored value is always one the
				#	synonyms file reader recognises (or "?")
				$response =~ s/^\s+//;
				$response =~ s/\s+$//;
				last if ($response =~ /^(\=|\!=|\<|\>|\~|\?)$/);
				print "No type/incorrect type entered. Please enter a relationship type.\n";
				$response = <STDIN>;
			}
			if (!defined($response))
			{	#	input has ended: stop prompting instead of asking forever
				$response = "?";
				$yes = 0;
			}
			$type{$i.$j} = $response;
		}
		else
		{	$type{$i.$j} = "?";
		}
	}
}

print "\nSynonyms lost forever...\n";
foreach $i (sort keys %synzLost)
{	foreach $j (@{$synzLost{$i}})
	{	print "LOST: $j, $i\n";
	}
}

#	one row per synonym found in the current ontology files
foreach $i (sort keys %syns)
{	foreach $j (@{$syns{$i}})
	{	print SYNOUTFILE "$i\t$name{$i}\t$type{$i.$j}\t$j\n";
	}
}
#	make sure everything is on disk before the file is moved into place
close(SYNOUTFILE) or warn "The file $SynOutFile could not be written completely: $!\n";

rename($SynFile, "go/doc/synonyms/oldSynonyms.txt")
	or warn "Could not rename $SynFile to go/doc/synonyms/oldSynonyms.txt: $!\n";
rename($SynOutFile, $SynFile)
	or warn "Could not rename $SynOutFile to $SynFile: $!\n";

#
#
#	SPELLCHECKER!
#	new SpellChecker code
#
#
#	Break up entries of the word list that contain "/" or ":". The compound
#	entry is removed and every non-empty piece is counted once, including
#	the piece after the last separator.
foreach $i (sort keys %word)
{	next unless ($i =~ /[\/\:]/);
	delete $word{$i};
	foreach my $piece (split(/[\/\:]/, $i))
	{	next if ($piece eq "");
		$word{$piece}++;
	}
}

close(INFILE);
close(LOGFILE);
exit(0);

#	refFormatter(REF)
#
#	Returns REF, a database cross-reference, in normalised form:
#	- EC:/TC: numbers have doubled dots and hyphens collapsed and are padded
#	  with ".-" up to three components after the leading class part
#	  (e.g. "EC:1.1" becomes "EC:1.1.-.-"); references that are already
#	  complete are returned as they are
#	- PMID/ISBN references lose all hyphens, full stops and whitespace
#	- GO:<text> references are lower-cased; an all-digit id that is not seven
#	  digits long is reported
#	Doubled colons and trailing whitespace are removed from every reference.
#	Problems are reported on STDOUT and the reference is returned unchanged
#	when it cannot be repaired. Reformatted EC/TC numbers are cached in %Ref.
#
#	REF is optional for backward compatibility: without an argument the
#	package variable $ref is formatted. That variable is never modified.
sub refFormatter
{	#	Copy the argument before any pattern match runs: callers may pass $1,
	#	which is aliased through @_ and changes with the next match.
	my $ref = @_ ? $_[0] : $::ref;
	return $ref unless defined($ref);

	$ref =~ s/::/:/g;
	$ref =~ s/\s+$//;

	if (exists($Ref{$ref}))
	{	return $Ref{$ref};
	}

	if ($ref =~ /([TE]C)[: ](.*)/)
	{	my ($letter, $data) = ($1, $2);
		my $tc = ($letter eq "TC") ? 1 : 0;

		#	nothing to do for a reference that is already complete
		if ($tc)
		{	if ($ref =~ /TC:[1-9\-]\.[A-E\-]\.(\d{1,}|-)\.(\d{1,}|-)\.(\d{1,}|-)/)
			{	return $ref;
			}
			print "Error with TC ref $ref\n";
		}
		else
		{	if ($ref =~ /EC:[1-6\-]\.(\d{1,}|-)\.(\d{1,}|-)\.(\d{1,}|-)/)
			{	return $ref;
			}
			print "Error with EC ref $ref\n";
		}

		$data =~ s/--/-/g;
		$data =~ s/\.\./\./g;
		$data =~ s/(\d)-/$1/g;

		my ($first, $last);
		my $null = 0;
		if ($tc)
		{	#	grab the first three characters. Should be number dot letter,
			#	where either part may be "-"
			$first = substr($data, 0, 3);
			$last = (length($data) > 3) ? substr($data, 3) : "";
			if ($first !~ /^[1-9\-]\.[A-E\-]$/)
			{	print "Error with TC reference: TC:$data\n";
				return $ref;
			}
			#	no class letter given: everything below it is unknown too
			$null = 1 if ($first =~ /-$/);
		}
		else
		{	#	grab the first character. It should be a number (1-6)
			$first = substr($data, 0, 1);
			$last = (length($data) > 1) ? substr($data, 1) : "";
			if ($first !~ /[1-6]/)
			{	if ($first eq "-")
				{	#	formatting fine, set null flag
					$null = 1;
				}
				else
				{	print "Error with first value in EC number: EC:$data\n";
					return $ref;
				}
			}
		}

		#	if it's a null reference, return it now to save any further bother with it
		if ($null)
		{	return $letter.":".$first.".-.-.-";
		}

		#	count the components that follow and pad them up to three
		$last =~ s/\.$//;
		my $parts = 0;
		$parts++ while ($last =~ /\.(\d{1,}|-)/g);
		$last .= ".-" x (3 - $parts) if ($parts < 3);

		$Ref{$ref} = $letter.":".$first.$last;
		return $Ref{$ref};
	}
	elsif ($ref =~ /(PMID|ISBN)/)
	{	$ref =~ s/[-\.\s]//g;
	}
	elsif ($ref =~ /GO:(.*)/)
	{	#	keep the id in a variable: the tests below would reset $1
		my $id = $1;
		if ($id =~ /[A-Z]/)
		{	$ref = "GO:".lc($id);
		}
		if ($id =~ /^\d+$/ && length($id) != 7)
		{	print "Incorrect GOid length $ref\n";
		}
	}

	return $ref;
}
