#!/usr/bin/perl -w
# OntologyChecks.pl
#
# WARNING: script not finished - work in progress!
#
# Checks done:
# Defs file
# - conflicts
# - comments format
# - illegal characters
# - first letter not capitalized
# - last letter not full stop
# - EC number formatting
# - double colons
# - obsolete formatting
#
# Ontology files
# - conflicts
# - terms with same name, diff GOids
# - secondary ID format
# - placeholder entries: <new term>, <new synonym>, XX:<new dbxref>
# - terms moved to diff ontology
# - terms in 1+ ontology
# - redundant rlnships (inc signs)
# - repeated is-a/part-of rlnships
# - bad dbx formatting
# - ISBN/PMID/GO as general dbxref
# - obs term rlnships
# - obs terms elsewhere in the ontology
# - terms or definitions lost without a trace
#
# Files needed:
# - copies of the current ontology files, stored in go/ontology/
# - copies of an old set of ontology files so the script can check for
#   lost terms, etc., stored in go/ontology/old/
# - copy of the defs file, stored in go/doc/GO.defs
# - an old defs file, stored in go/doc/old/GO.defs
#
# Checks.pl creates a data file with a brief summary of old ontology
# contents. It compares this to the current state of the ontologies and
# reports on the changes. It creates a new version of the ontologies
# with some of the above mistakes corrected and the others documented.
#
# New additions:
# Added Synonymizer.pl functionality (finds the synonyms and prompts to update
# the synonyms file)
#
# Checks for secondary IDs:
# - secIDs don't match extant terms
# - secIDs aren't shared by more than one term / move between terms
#
# Checks OBSOLETE not in defs of non-obsolete terms
#
# Script by Amelia Ireland
# Comments, bugs, complete-lack-of-functionality complaints to
# aji@ebi.ac.uk
#
# TO DO:
# - code to isolate completely loopy comments
# - check syns/refs always followed by a reference ID (is this necessary?)
# - update redundancy code to remove redundancies (may not be worthwhile...)
# - check whether or not the synonyms and obsoletes need to be updated in CVS
# - add the Dictionary to this script
#
#if (-e $dataFile)
#{

# ----------------------------------------------------------------------
# Main program.
#
# Phases, in order (each phase relies on package globals filled in by the
# previous ones, so the statement order matters):
#   1. load the existing synonyms file               (%syn, %type, %synTerm)
#   2. load the summary of the previous run          (%old, %ont, %def, %obs,
#                                                     %name, %secID)
#   3. scan the three ontology files, writing corrected copies
#   4. reconcile old and new secondary IDs
#   5. scan GO.defs, writing a corrected copy
#   6. rewrite the data file, the obsoletes files and the synonyms file
#
# All state is kept in package globals on purpose: refFormatter() and the
# spell-checker code at the end of the file share %Ref and %word with it.
# ----------------------------------------------------------------------

# files required by the program
$dataFile   = "go/data.txt";
@Ontologies = ("go/ontology/component.ontology",
               "go/ontology/function.ontology",
               "go/ontology/process.ontology");
$DefFile    = "go/doc/GO.defs";
$SynFile    = "go/doc/synonyms/Synonyms.txt";

# new files created by the program
$LogFile       = "OntologyChecks Log";
@NewOntologies = ("go/ontology/newcomponent.ontology",
                  "go/ontology/newfunction.ontology",
                  "go/ontology/newprocess.ontology");
$DefOutFile    = "go/doc/newGO.defs";
$SynOutFile    = "go/doc/synonyms/newSynonyms.txt";
@OldOntologies = ("go/ontology/oldcomponent.ontology",
                  "go/ontology/oldfunction.ontology",
                  "go/ontology/oldprocess.ontology");

# create a new Synonyms mapping file
open(SYNOUTFILE, '>', $SynOutFile) or die "The file $SynOutFile could not be created.\n";

# opens the existing synonyms file and gets the synonym data
# (columns: goid, term name, relationship sign, synonym text)
open(SYNFILE, '<', $SynFile) or die "The file $SynFile could not be found.\n";
while (<SYNFILE>) {
    $Line = $_;
    if ($Line !~ /^!/) {
        $Line =~ s/"//g;
        if ($Line =~ /(GO:\d{7})\t.*?\t(.*?)\t(.*)/) {
            $goid   = $1;
            $string = $3;
            $sign   = $2;
            $synTerm{$goid} = 1;          # "1" = term had synonyms in the old file
            $string =~ s/(.*?)\t.*/$1/;   # drop any trailing columns
            push(@{$syn{$goid}}, $string);
            $type{$goid.$string} = $sign;
            # synonyms without a recognised sign are treated as new ones
            if ($sign !~ /^([=\<\>\~]|!=)$/) {
                push(@{$newSynz{$goid}}, $string);
            }
        }
    }
    else {
        # header lines are copied straight to the new synonyms file
        print SYNOUTFILE "$Line";
    }
}
close(SYNFILE);

%ontology = (0 => 'cellular component', 1 => 'molecular function', 2 => 'biological process');
%obsId    = (0 => '0008370', 1 => '0008369', 2 => '0008371');
%Ont      = (0 => 'CO', 1 => 'FO', 2 => 'PO');

open(LOGFILE, '>', $LogFile) or die "The file $LogFile could not be created.\n";

# Load the summary written by the previous run, if there is one.
$dataFile = "go/data.txt";
if (-e $dataFile) {
    open(DATA, '<', $dataFile) or die "The file $dataFile could not be opened.\n";
    while (<DATA>) {
        $Line = $_;
        #    ont       def      obs       goid         name
        if (/([012])\t([01])\t([012])\t(GO:\d{7})\t(.*?)\t.*/) {
            $goid = $4;
            $name = $5;
            $old{$4} = 0;
            $ont{$4} = $1;
            $def{$4} = $2;
            $obs{$4} = $3;
            # a GO id in the name column marks the entry as a secondary ID
            if ($name =~ /(GO:\d{7})/) {
                $secID{$goid} = $1;
            }
            else {
                $name{$goid} = $5;
            }
        }
        else {
            print "Error in data file!\n";
        }
    }
    close(DATA);
    print "Loaded old data file.\n";
}

# $error collects the index (0/1/2) of every ontology whose copy was changed.
$error = "";
$ont   = 0;
foreach $InFile (@Ontologies) {
    open(INFILE, '<', $InFile) or die "The file $InFile could not be found.\n";
    $OutFile = $NewOntologies[$ont];
    open(OUTFILE, '>', $OutFile) or die "The file $OutFile could not be created.\n";
    $LineNo   = 1;
    $lastGoid = "";
    $lastNest = 0;
    %term     = ();
    @path     = ();
    print "Loading current $ontology{$ont} ontology...\n";

    while (<INFILE>) {
        $Line = $_;
        $core = $_;
        # $core is the line up to (not including) the second relationship sign
        if ($core =~ /(.*?[<%].*? )[<%].*/) {
            $core = $1;
        }
        $obs = 0;

        # Ontology file conflicts (CVS conflict markers)
        if ((/<<</) || (/>>>/)) {
            print "Error - conflict found in the ontology.\n";
            print LOGFILE "$Ont{$ont} $LineNo: conflict marker in ontology.\n";
            # !saved-by:
            # !date:
            # !version:
        }

        if ($Line =~ /^(\s{1,})([<%])(.*?) ; (GO:\d{7})/) {
            $thisNest = length($1);
            $goid     = $4;
            $term     = $3;
            $sign     = $2.$goid;
            $s        = $2;
            $path     = "";
            $term =~ s/\\//g;

            $new{$goid} = 1;

            # Ontology file repeated is-a/part-of rlnships
            if ($goid eq $lastGoid) {
                print "Repeated line: $LineNo, $goid.\n";
                print LOGFILE "$Ont{$ont} $LineNo: repeated line. $ontology{$ont} ontology\n";
            }
            $lastGoid = $goid;

            # Drop the ancestors that are no longer above this line.  The
            # count is negative when the indent jumps by more than one
            # level; "> 0" keeps that case from looping forever.
            my $popCount = $lastNest - $thisNest + 1;
            while ($popCount > 0) {
                pop(@path);
                $popCount--;
            }

            foreach $i (@path) {
                if ($i =~ /$obsId{$ont}/) {
                    $obs = 1;
                    # Ontology file incorrect obsolete term rlnships
                    if ($s eq "<") {
                        print "Obsolete relationship needs to be changed: $goid.\n";
                        print LOGFILE "$Ont{$ont} $LineNo: obsolete relationship needs to be changed: $goid\n";
                        $error .= $ont;
                        if ($Line =~ /\A(\s{1,})<(.*)/) {
                            $Line = $1."%".$2."\n";
                        }
                    }
                }
                $path .= $i;
            }
            $path .= $s;
            push(@path, $sign);
            $lastNest = $thisNest;

            # Ontology file redundant rlnships (inc signs)
            if (exists($paths{$goid})) {
                @tempList = @{$paths{$goid}};    # copy: chop() below must not be saved
                foreach $i (@tempList) {
                    $t = chop($i);
                    if (($i =~ /$path/) || ($path =~ /$i/)) {
                        if ($s eq $t) {
                            # could be deleted, but only if it was the line
                            # just checked that was redundant
                            $red{$goid} = 1;
                        }
                        else {
                            $red{$goid} = 2;
                        }
                    }
                }
                $path .= $s;
                push(@{$paths{$goid}}, $path);
            }
            else {
                $path .= $s;
                $paths{$goid} = [ $path ];
            }

            #--------START IMPORTANT BIT
            if (exists($name{$goid})) {
                # it's not a new term
                #
                # Ontology file terms moved to diff ontology
                # Ontology file terms in 1+ ontology
                #
                # How are we going to store this info? Want to know which
                # ontologies it's in.  Also know which ontology it should be
                # in - it should be in the original ontology and removed from
                # the one it has been added to.  Wouldn't be able to tell if
                # term is new.
                if ($ont{$goid} != $ont) {
                    print "Error - $goid in more than one ontology.\n";
                }

                # check for obsoletion errors
                if ($obs{$goid} != $obs) {
                    if ($obs{$goid} == 0) {
                        # the term has been obsoleted
                        $obs{$goid} = 1;
                    }
                    # Ontology file obs terms elsewhere in the ontology
                    elsif ($obs{$goid} == 1) {
                        # the term was/is obsolete but is active! Uh-oh!
                        print "Error! Obsolete term in active ontology! $goid\n";
                        print LOGFILE "Obsolete term in active ontology, $goid\n";
                        $obs{$goid} = 2;    # need to change this when the error is sorted out.
                    }
                }

                if (!exists($done{$goid})) {
                    $done{$goid} = 1;
                    $old{$goid} += 10;

                    # check for term name changes
                    if ($name{$goid} ne $term) {
                        $name{$goid} = $term;
                    }

                    # check that the term is not also a secondary ID
                    if (exists $secID{$goid}) {
                        print "Existing term $goid is also a secondary ID of $secID{$goid}.\n";
                    }

                    # Ontology file terms with same name, diff GOids
                    if ($obs != 1) {
                        if ((exists($nameCheck{$term})) && ($nameCheck{$term} ne $goid)) {
                            print "Name repeated: $term, $nameCheck{$term} and $goid\n";
                            if (exists($nameTwice{$term})) {
                                if ($nameTwice{$term} !~ /$goid/) {
                                    $nameTwice{$term} .= ", ".$goid;
                                }
                            }
                            else {
                                $nameTwice{$term} = $nameCheck{$term}.", ".$goid;
                            }
                        }
                        else {
                            $nameCheck{$term} = $goid;
                        }
                    }
                }
            }
            else {
                # a completely brand new term!
                $ont{$goid}  = $ont;
                $name{$goid} = $term;
                $obs{$goid}  = $obs;
                $def{$goid}  = 0;
                if (($ont == 1) && ($obs == 0)) {
                    if ($term !~ /activity/) {
                        print "Possible error with new function term: $term\n";
                    }
                }

                # Ontology file terms with same name, diff GOids
                if ($obs != 1) {
                    if ((exists($nameCheck{$term})) && ($nameCheck{$term} ne $goid)) {
                        print "Name repeated: $term, $nameCheck{$term} and $goid\n";
                        # Same bookkeeping as for existing terms, so the list
                        # always starts with the first holder of the name.
                        if (exists($nameTwice{$term})) {
                            if ($nameTwice{$term} !~ /$goid/) {
                                $nameTwice{$term} .= ", ".$goid;
                            }
                        }
                        else {
                            $nameTwice{$term} = $nameCheck{$term}.", ".$goid;
                        }
                        if ($ont{$nameCheck{$term}} == $ont{$goid}) {
                            print "Same ontology.\n";
                        }
                    }
                    else {
                        $nameCheck{$term} = $goid;
                    }
                }
                $done{$goid} = 1;
            }
            #--------END IMPORTANT BIT
        }

        # find the synonyms for the synonyms script
        if (/synonym/ and !exists $synDone{$goid} and $obs == 0) {
            $ref = $Line;
            $ref =~ s/\\//g;
            while ($ref =~ /synonym:(.*?)($| ; .*| [<%])/g) {
                # Copy the captures at once: every later match overwrites them.
                my ($synText, $rest) = ($1, $2);
                $synTerm{$goid} .= 2;     # "2" = term has synonyms in the current files
                if ($synText =~ /(GO:\d{7})/) {
                    # synonym formatting error - secondary ID as a synonym
                    my $secId = $1;
                    $error .= $ont;
                    print "$Ont{$ont} $LineNo: Formatting error - secondary ID\n";
                    print LOGFILE "$Ont{$ont} $LineNo: synonym formatting error\n";
                    push(@synonymFormat, $secId);
                    # reformat: move the ID next to the primary GO id
                    $Line =~ s/ ; synonym:\Q$secId\E//g;
                    $Line =~ s/\Q$goid\E/$goid, $secId/g;
                }
                else {
                    $synonym = $synText;
                    push(@{$syns{$goid}}, $synonym);
                    while ($synonym =~ /(\S+)\s*/g) {
                        $w = $1;
                        if ($w =~ /[A-Za-z]/) {
                            $word{$w}++;
                        }
                    }
                }
                $ref = $rest;             # carry on with the rest of the line
            }
            $synDone{$goid} = 1;
        }

        # Error: GOID as a general dbxref
        if (/(GO:\d{7}) ; (GO:\d{7})/) {
            my ($firstId, $secondId) = ($1, $2);
            $error .= $ont;
            print "$Ont{$ont} $LineNo: Formatting error - secondary ID\n";
            print LOGFILE "$Ont{$ont} $LineNo: GOID as general dbxref\n";
            push(@synonymFormat, $firstId);
            $Line =~ s/\Q$firstId\E ; \Q$secondId\E/$firstId, $secondId/g;
        }

        # start SECONDARY ID CODE
        # add the secondary IDs
        if ($core =~ /GO:\d{7}, (GO:\d{7}.*)/) {
            $j = $1;
            while ($j =~ /(GO:\d{7})/g) {
                # check the secondary ID against what was in the data file
                if (exists $secID{$1} and $secID{$1} ne $goid) {
                    print "$1 is a secondary ID for $goid (current) and $secID{$1} (old).\n";
                }
                # check that the secondary ID does not exist already in the
                # ontologies as a secID for something else
                if (exists $secondary{$1} and $secondary{$1} ne $goid) {
                    print "$1 is a secondary ID for $goid and $secondary{$1} in current files.\n";
                }
                $secondary{$1} = $goid;
            }
        }
        # end SECONDARY ID CODE

        # Ontology file placeholder entries (new term / new synonym / new dbxref)
        if (/\>/) {
            print "Error! \> in ontology!\n$Line";
            print LOGFILE "$Ont{$ont} $LineNo: \> in ontology: check for new synonyms, terms or dbxrefs\n";
            # removes any blank references straight away
            # NOTE(review): the placeholder text inside the next three
            # patterns was lost from the copy of the script this was rebuilt
            # from; "<new dbxref>", "<new synonym>" and "<new term>" are
            # reconstructions - confirm against a pristine copy.
            if (/:\\<new/) {
                $Line =~ s/ ; XX:\\<new dbxref\\>//g;
                $Line =~ s/ ; synonym:\\<new synonym\\>//g;
            }
            if (/\\<new term\\> ; (GO:\d{7})/) {
                # we're in trouble!
                print "No name: $1\n$Line";
                # either enter a name or remove all refs to it?
            }
            print "$Line";
        }

        # Ontology file ISBN/PMID/GO as general dbxref
        # check for incorrect dbxrefs
        if (/(PMID:.*|ISBN:.*|GO:\D{1,}.*)/) {
            $ref = $1;
            $ref =~ s/ [<;%].*//g;
            my $rawRef = $ref;            # text exactly as it appears in the line
            $error .= $ont;
            $ref = refFormatter($ref);
            print "$Ont{$ont} $LineNo: incorrect general dbxref: $goid, $ref\n$Line";
            print LOGFILE "$Ont{$ont} $LineNo: incorrect general dbxref: $goid, $ref\n";
            if ((exists($def{$goid})) && ($def{$goid} == 1)) {
                # remove what is really in the line, not the reformatted text
                $Line =~ s/ ; \Q$rawRef\E//g;
                $extraRef{$goid} = $ref;
            }
            else {
                print "Incorrect dbxref, cannot be moved: $goid, $ref\n$Line";
                $extraRef{$goid} = $ref;
            }
        }

        # EC / TC ref formatting
        if (/; ([ET]C:.*)/) {
            $ref = $1;
            $ref =~ s/ [<;%].*//g;
            my $rawRef = $ref;            # refFormatter may alter the global $ref
            $newRef = refFormatter($ref);
            if ($newRef ne $rawRef) {
                print "Reformatted $rawRef to $newRef\n";
                print LOGFILE "Reformatted $rawRef to $newRef\n";
                $Line =~ s/\Q$rawRef\E/$newRef/g;
                $error .= $ont;
            }
        }

        print OUTFILE "$Line";
        $LineNo++;
    }
    # flush the copy before it is renamed or removed below
    close(INFILE);
    close(OUTFILE);
    $ont++;
    print "done.\n";
}

# Keep the corrected copy only for ontologies in which something was changed.
for ($i = 0; $i < 3; $i++) {
    if ($error !~ /$i/) {
        unlink($NewOntologies[$i]);
    }
    else {
        rename($Ontologies[$i], $OldOntologies[$i])
            or warn "Could not rename $Ontologies[$i]: $!\n";
        rename($NewOntologies[$i], $Ontologies[$i])
            or warn "Could not rename $NewOntologies[$i]: $!\n";
    }
}

# check the secondary IDs. Go through the old secIDs and check whether any
# have been lost or reinstated
foreach $i (keys %secID) {
    if ($secID{$i} =~ /GO:\d{7}/) {
        if (!exists $secondary{$i}) {
            print "secondary ID $i (was secID to $secID{$i}) ";
            if (exists $new{$i}) {
                print "has been reinstated as a term\n";
            }
            else {
                print "lost\n";
            }
        }
    }
    delete $secID{$i};
}

# check the new secondary IDs don't have any erroneous data and add them to
# the list of terms that appear in the new files
foreach $i (keys %secondary) {
    if (exists $name{$i} and exists $new{$i}) {
        print "Error! Name for sec ID $i exists!\n";
    }
    $name{$i} = $secondary{$i};
    $ont{$i}  = $ont{$secondary{$i}};
    $obs{$i}  = $obs{$secondary{$i}};
    $def{$i}  = $def{$secondary{$i}};
    $new{$i}  = 1;
}

# ----------------------------------------------------------------------
# GO.defs pass.  From here on $error is a plain flag: 1 = the new defs file
# differs from the old one and must replace it.
# ----------------------------------------------------------------------
$error  = 0;
@comm   = ('This term was made obsolete', 'This term was split from', 'See also', 'Note that');
$obs    = 'OBSOLETE (was not defined before being made obsolete).';
$obsDef = 'OBSOLETE. ';
$refFlag = 0;
$obsFlag = 0;

open(DEFFILE, '<', $DefFile) or die "The file $DefFile could not be found.\n";
open(DEFOUTFILE, '>', $DefOutFile) or die "The file $DefOutFile could not be created.\n";
print "Opening GO.defs file.\n";

while (<DEFFILE>) {
    $Line = $_;
    chomp($Line);

    # Look for strange characters
    if ((/<<<<<<</) || (/>>>>>>>/)) {
        print "Error - conflict found in the ontology.\n";
    }
    if (/[\<\>\{\}\&\|\?\"\`\t\\]/) {
        unless (/[\d| ]->[\d| ]/ or /http:/ or /-\|-/) {
            print LOGFILE "possible illegal character error $goid: \"$Line\"\n\n";
        }
    }
    if (/[\x80-\xFF]/) {
        print LOGFILE "High-ASCII illegal character error $goid: \"$Line\"\n\n";
    }
    if (/ - /) {
        print LOGFILE "possible illegal symbol string $goid: \"$Line\"\n\n";
    }

    if (/term: (.*)/) {
        $termName = $1;
        while ($termName =~ /(\S+)\s*/g) {
            $w = $1;
            if ($w =~ /[A-Za-z]/) {
                $word{$w}++;
            }
        }
    }
    elsif (/goid: (.*)/) {
        $refFlag = 0;
        $goid    = $1;
        $def{$goid} = 1;
        $obsFlag = $obs{$goid};
        if ($termName ne $name{$goid}) {
            print "Error : term name inconsistency! $goid, $name{$goid}\n";
        }
        # a dbxref found in the ontology file is waiting to be moved here
        if (exists($extraRef{$goid})) {
            $refFlag = 1;
        }
        if (exists($old{$goid})) {
            if ($old{$goid} < 10) {
                print "Error - term missing but definition still present: $goid\n";
            }
            else {
                delete $old{$goid};
            }
        }
    }
    elsif (/definition: (.*)/) {
        $def    = $1;
        $oldDef = $def;
        $def =~ s/\\n/ /g;
        $def =~ s/\.([A-Za-z])/\. $1/g;
        $def =~ s/ {2,}/ /g;              # collapse runs of spaces

        if ($def =~ /^[a-z]/) {
            print "First letter not capitalized: $goid.\n";
            $def = ucfirst($def);
            print LOGFILE "First letter capitalized: $goid.\n";
            $error = 1;
        }

        if ($def =~ /[\[\]\{\}]/) {
            # compare the number of opening and closing brackets
            my $left  = ($def =~ tr/[{//);
            my $right = ($def =~ tr/]}//);
            if ($left != $right) {
                print "Possible error with brackets: $goid.\n";
                print LOGFILE "Possible error with brackets: $goid.\n";
            }
        }

        if ($def !~ /\.$/) {
            print "Definition did not end with a full stop: $goid:\n";
            $error = 1;
            while (substr($def, -1, 1) eq " ") {
                chop($def);
            }
            my $lastChar = substr($def, -1, 1);
            if ($lastChar =~ /[,;:]/) {
                chop($def);
                $def .= ".";
            }
            elsif ($lastChar =~ /\)/) {
                $def .= ".";
            }
            elsif ($lastChar !~ /\./) {
                $def .= ".";
            }
            print LOGFILE "Definition given full stop at end: $goid.\n";
        }

        while ($def =~ /(\S+)\s*/g) {
            $w = $1;
            if ($w =~ /[A-Za-z]/) {
                $word{$w}++;
            }
        }

        # if the term is obsolete, check that the def contains the word
        # obsolete somewhere
        # need to alter this check so it looks at previous data to see if the
        # def has 'obsolete' in it.
        if (exists($obs{$goid}) and $obs{$goid} != 0) {
            if (($def ne $obs) && (substr($def, 0, 10) ne $obsDef)) {
                print "Error in obsolete formatting: $goid.\n$Line\n";
                print LOGFILE "Error in obsolete formatting: $goid.\n$Line\n";
                if ($def =~ /defined/) {
                    $def = $obs;
                }
                else {
                    print "Replace? Choose an option (n does nothing):\n1: OBSOLETE (undefined) 2: add OBSOLETE to existing def 3: Suggest a new def.\n";
                    $response = <STDIN>;
                    $response = "n" unless defined $response;   # end of input: leave the def alone
                    chomp($response);
                    if ($response eq "1") {
                        $def = $obs;
                    }
                    elsif ($response eq "2") {
                        $def = $obsDef.$def;
                    }
                    elsif ($response ne "n") {
                        $def = $response;
                    }
                }
            }
        }
        if ($def =~ m/obsolete/i and $obs{$goid} == 0) {
            print "Error - non obsolete term $goid has obsolete in definition!\n";
        }
        if ($def ne $oldDef) {
            $error = 1;
        }
        $Line = "definition: $def";
    }
    elsif (/definition_reference: (.*)/) {
        $ref     = $1;
        $origRef = $1;
        $ref =~ s/::/:/g;
        while (substr($ref, -1, 1) eq " ") {
            $ref = substr($ref, 0, length($ref)-1);
        }
        # EC ref formatting (pass the cleaned-up reference)
        if (/([ET]C:.*|ISBN.*|PMID.*)/) {
            $ref = refFormatter($ref);
        }
        if ($refFlag == 1) {
            # the pending extra ref is already present: nothing to add
            if ($ref eq $extraRef{$goid}) {
                $refFlag = 0;
            }
        }
        if ($origRef ne $ref) {
            print "Reference reformatted: $goid\n";
            print LOGFILE "Reference reformatted: $goid\n";
            $error = 1;
        }
        $Line = "definition_reference: $ref";
    }
    elsif (/comment: (.*)/) {
        $comment = $1;                    # take the capture before anything can clobber it
        $obsFlag = 0;
        if ($refFlag == 1) {
            print DEFOUTFILE "definition_reference: $extraRef{$goid}\n";
            print "Adding dbxref $extraRef{$goid} to $goid.\n";
            $error   = 1;
            $refFlag = 0;
        }
        $oldComment = $comment;
        $comment =~ s/\\n/ /g;
        $comment =~ s/ {2,}/ /g;          # collapse runs of spaces
        $comment =~ s/\.'/'\./g;

        if ($comment =~ /^[a-z]/) {
            print "First letter of comment not capitalized: $goid.\n";
            print LOGFILE "First letter of comment not capitalized: $goid.\n";
            $comment = ucfirst($comment);
            $error = 1;
        }
        if ($comment !~ /\.$/) {
            print "Comment did not end with a full stop: $goid:\n";
            print LOGFILE "Comment did not end with a full stop: $goid.\n";
            $error = 1;
            while (substr($comment, -1, 1) eq " ") {
                chop($comment);
            }
            if (substr($comment, -1, 1) =~ /[,;:]/) {
                chop($comment);
                $comment .= ".";
            }
            elsif (substr($comment, -1, 1) =~ /\)/) {
                $comment .= ".";
            }
            elsif (substr($comment, -1, 1) !~ /\./) {
                $comment .= ".";
            }
        }

        # Expected forms:
        #To update annotations, use the [molecular function|biological process|cellular component] term '[text] ; GO:[id]'.
        #To update annotations, consider the [molecular function|biological process|cellular component] terms '[text1] ; GO:[id1]', '[text2] ; GO:[id2]' and '[text3] ; GO:[id3]' (and its child[ren]).
        # code to spot errors in the suggesting alternate terms bit...
        if ($comment =~ /To update annotations, (use|consider) the (.*?\.)/i) {
            $alt = $2;
            $use = 0;
            # the match is case-insensitive, so compare in lower case
            if (lc($1) eq 'use') {
                $use = 1;
            }

            # every GO id should sit inside one pair of single quotes
            $i = 0;
            while ($alt =~ /(GO:\S*)/g) {
                $i++;
            }
            $j = 0;
            while ($alt =~ /'/g) {
                $j++;
            }
            if ($j != $i * 2) {
                print "$goid : Error with number of 's - $j, $i\n";
                print "$alt\n";
                print "Ignore (I) or add?\n";
                $response = <STDIN>;
                $response = "i" unless defined $response;       # end of input: ignore
                chomp($response);
                while ($response eq "") {
                    print "No response. Please try again.\n";
                    $response = <STDIN>;
                    $response = "i" unless defined $response;
                    chomp($response);
                }
                if ($response ne "i") {
                    $alt = $response;
                }
            }

            while ($alt =~ /\'(.*?); (GO:\S*?)\'/g) {
                $refName = $1;            # keeps the space before the ';'
                $refGoid = $2;
                if ($refGoid =~ /(GO:\d{7})/) {
                    $refGoid = $1;
                    if (exists $secondary{$refGoid}) {
                        print "Error: $refGoid is a synonym for $secondary{$refGoid}\n";
                        print LOGFILE "Replaced secondary GOID $refGoid with $secondary{$refGoid}.\n";
                        $comment =~ s/$refGoid/$secondary{$refGoid}/;
                        $refGoid = $secondary{$refGoid};
                    }
                    if (exists $obs{$refGoid} and $obs{$refGoid} != 0) {
                        print "Error, $goid: suggested term $refGoid is obsolete!\n";
                    }
                    if (exists $name{$refGoid} and $refName ne $name{$refGoid}." ") {
                        print "Error with $refGoid term name in comment for $goid\n$name{$refGoid}, $refName.\n";
                        print LOGFILE "Comment changed for $goid ($refGoid had incorrect term name)\n";
                        # term names may hold regex metacharacters: quote them
                        $comment =~ s/\Q$refName\E; \Q$refGoid\E/$name{$refGoid} ; $refGoid/g;
                    }
                    push(@{$referenced{$refGoid}}, $goid);
                    if ($use == 1) {
                        push(@{$replacement{$goid}}, $refGoid);
                    }
                    else {
                        push(@{$suggested{$goid}}, $refGoid);
                    }
                    if (($i == 1) && ($alt !~ /its child/)) {
                        if ($alt ne "$ontology{$ont{$refGoid}} term '$name{$refGoid} ; $refGoid'.") {
                            print "error with ontology string\n$comment\n";
                            $comment =~ s/\Q$alt\E/$ontology{$ont{$refGoid}} term '$name{$refGoid} ; $refGoid'./;
                            print LOGFILE "Comment for $goid incorrect; replaced $alt with $ontology{$ont{$refGoid}} term '$name{$refGoid} ; $refGoid'.\n";
                        }
                    }
                }
                else {
                    print "$refGoid is not a proper GO term - beware!\n";
                }
            }
        }
        else {
            # any other comment must start with one of the standard phrases
            $flag = 0;
            foreach $i (@comm) {
                if ($comment =~ /\A$i/) {
                    $flag = 1;
                }
            }
            if ($flag == 0) {
                print "Comment formatting error: $goid\n$Line\n";
                print LOGFILE "Comment formatting error: $goid\n";
            }
        }

        if ($oldComment ne $comment) {
            $error = 1;
        }
        $Line = "comment: ".$comment;
        while ($comment =~ /(\S+)\s*/g) {
            $w = $1;
            if ($w =~ /[A-Za-z]/) {
                $word{$w}++;
            }
        }
    }
    elsif ($Line eq "") {
        # end of a stanza
        if ($refFlag == 1) {
            print DEFOUTFILE "definition_reference: $extraRef{$goid}\n";
            print "Adding dbxref $extraRef{$goid} to $goid.\n";
            print LOGFILE "Added dbxref $extraRef{$goid} to $goid.\n";
            # as in the comment branch: the defs file changed, and the
            # reference must not be written a second time
            $error   = 1;
            $refFlag = 0;
        }
        if ($obsFlag == 1) {
            print "Missing comment for obsolete term $goid.\n";
            print LOGFILE "Missing comment for obsolete term $goid.\n";
            $obsFlag = 0;
        }
    }
    print DEFOUTFILE "$Line\n";
}
close(DEFFILE);

# Offer to add a stub definition for every obsolete term that has none.
foreach $i (sort keys %obs) {
    if ($obs{$i} != 0) {
        push(@Obs, $i);
    }
}
foreach $i (@Obs) {
    if ($def{$i} == 0) {
        print "No def for obsolete term $i. Add one?\n";
        $response = <STDIN>;
        $response = "" unless defined $response;
        chomp($response);
        if ($response eq "y") {
            # the stanza belongs in the new defs file, not in an ontology file
            print DEFOUTFILE "term: $name{$i}\ngoid: $i\ndefinition: $obs\ndefinition_reference: GO:curators\n\n";
            $error = 1;
        }
        else {
            print "No definition added.\n";
        }
    }
}
close(DEFOUTFILE);

$count = 0;
if ($error == 0) {
    unlink($DefOutFile);
}
else {
    rename($DefFile, "go/doc/oldGO.defs") or warn "Could not rename $DefFile: $!\n";
    rename($DefOutFile, $DefFile) or warn "Could not rename $DefOutFile: $!\n";
}

### Reprinting the DATA file ###
open(DATA, '>', $dataFile) or die "The file $dataFile could not be created.\n";
foreach $i (sort keys %new) {
    print DATA "$ont{$i}\t$def{$i}\t$obs{$i}\t$i\t$name{$i}\t";
    if ($obs{$i} != 0) {
        $count++;
    }
    if (exists($referenced{$i})) {
        foreach $j (sort @{$referenced{$i}}) {
            print DATA "$j";
        }
    }
    print DATA "\n";
}
close(DATA);
print "Total obsoletes: $count\n";

$ObsDirFile = "go/doc/obsoletes-exact";
open(OBSDIRFILE, '>', $ObsDirFile) or die "The file $ObsDirFile could not be created.\n";
$ObsAllFile = "go/doc/obsoletes-inexact";
open(OBSALLFILE, '>', $ObsAllFile) or die "The file $ObsAllFile could not be created.\n";

# Every '$' of the keyword strings is escaped so that "$\" (the output
# record separator) can never be interpolated in front of the 'n'.
print OBSDIRFILE "!version: \$Revision: 1.6 \$\n!date: \$Date: 2003/05/30 14:08:57 \$\n!Obsolete terms and direct annotation substitutes\n!\n!Obsolete\tAlternative\n";
print OBSALLFILE "!version: \$Revision: 1.6 \$\n!date: \$Date: 2003/05/30 14:08:57 \$\n!Obsolete terms and possible annotation substitutes\n!\n!Obsolete\tAlternative\n";

$count = 0;
foreach $i (sort keys %replacement) {
    foreach $j (sort @{$replacement{$i}}) {
        print OBSDIRFILE "$i\t$j\n";
    }
    $count++;
}
print "Direct replacements: $count\n";

$count = 0;
foreach $i (sort keys %suggested) {
    foreach $j (sort @{$suggested{$i}}) {
        print OBSALLFILE "$i\t$j\n";
    }
    $count++;
}
print "Indirect replacements: $count\n";
close(OBSDIRFILE);
close(OBSALLFILE);

# %old: 0 = listed in the old data file only; +10 = seen in the current
# ontologies; entries whose definition was seen have been deleted.
foreach $i (sort keys %old) {
    if (exists($secondary{$i})) {
        delete $old{$i};
    }
    elsif ($old{$i} == 10) {
        # new term
    }
    else {
        if ($old{$i} == 11) {
            print "Def lost: $i\n";
        }
        if ($old{$i} < 10) {
            print "Term lost: $i\n";
        }
        if ($old{$i} > 11) {
            print "Script error: $i\n";
        }
    }
}

foreach $i (sort keys %red) {
    if ($red{$i} == 1) {
        print "Redundant relationship: $i, could be removed\n";
    }
}
foreach $i (sort keys %extraRef) {
    print "Ref in ontology file: $i, $extraRef{$i}\n";
}
foreach $i (sort keys %refReplaced) {
    print "Ref replaced: $i, $refReplaced{$i}\n";
}
foreach $i (sort keys %nameTwice) {
    print "Duplicated name: $i, $nameTwice{$i}\n";
}

# Compare old and current synonyms.  $synTerm{$id} holds "1" when the term
# had synonyms in the old file, followed by one "2" per synonym found in the
# current ontologies.
foreach $i (sort keys %synTerm) {
    if ($synTerm{$i} =~ /12/) {
        @oldSyn = @{ $syn{$i}  || [] };
        @newSyn = @{ $syns{$i} || [] };
        my %tracker = ();
        $tracker{$_} .= 1 for @oldSyn;
        $tracker{$_} .= 2 for @newSyn;
        for (sort keys %tracker) {
            if ($tracker{$_} !~ /1/) {
                push(@{$newSynz{$i}}, $_);
            }
            elsif ($tracker{$_} !~ /2/) {
                push(@{$synzLost{$i}}, $_);
            }
        }
    }
    elsif ($synTerm{$i} !~ /2/) {
        @{$synzLost{$i}} = @{ $syn{$i} || [] };
    }
    else {
        # new synonyms
        @{$newSynz{$i}} = @{ $syns{$i} || [] };
    }
}

print "Update synonyms now?\n";
$response = <STDIN>;
$response = "" unless defined $response;
if ($response =~ /y/) {
    $yes = 1;
}
else {
    $yes = 0;
}

print "\nSynonyms gained...\n";
foreach $i (sort keys %newSynz) {
    foreach $j (@{$newSynz{$i}}) {
        if ($yes == 1) {
            print "$i: $name{$i} ? $j\n";
            print "Please suggest a relationship type:\n=\t!=\t<\t>\t~\n";
            $response = <STDIN>;
            $response = "?" unless defined $response;           # end of input: type unknown
            chomp($response);
            while ($response !~ /(\=|\!=|\<|\>|\~|\?)/) {
                print "No type/incorrect type entered. Please enter a relationship type.\n";
                $response = <STDIN>;
                $response = "?" unless defined $response;
                chomp($response);
            }
            $type{$i.$j} = $response;
        }
        else {
            $type{$i.$j} = "?";
        }
    }
}

print "\nSynonyms lost forever...\n";
foreach $i (sort keys %synzLost) {
    foreach $j (@{$synzLost{$i}}) {
        print "LOST: $j, $i\n";
    }
}

foreach $i (sort keys %syns) {
    foreach $j (@{$syns{$i}}) {
        print SYNOUTFILE "$i\t$name{$i}\t$type{$i.$j}\t$j\n";
    }
}
close(SYNOUTFILE);
rename($SynFile, "go/doc/synonyms/oldSynonyms.txt") or warn "Could not rename $SynFile: $!\n";
rename($SynOutFile, $SynFile) or warn "Could not rename $SynOutFile: $!\n";

#
#
# SPELLCHECKER!
# new SpellChecker code
#
# Tokens that contain '/' or ':' (e.g. "DNA/RNA") are replaced in %word by
# their individual pieces.  Every non-empty piece is counted, including the
# last one.
foreach my $token (sort keys %word) {
    next unless $token =~ m{[/:]};
    delete $word{$token};
    foreach my $piece (split m{[/:]}, $token) {
        $word{$piece}++ if length $piece;
    }
}

close(INFILE);
close(LOGFILE);
exit(0);

# refFormatter($ref)
#
# Normalises one database cross-reference and returns the result.
#   EC:/TC: numbers - padded with ".-" to the full number of fields
#                     (4 for EC, 5 for TC); "EC 1.1" and "EC|1.1" are
#                     accepted and rewritten with a colon.  A reference that
#                     cannot be repaired is returned unchanged after a
#                     message has been printed.
#   PMID / ISBN     - hyphens, full stops and white space are removed.
#   GO:xxx          - text after "GO:" is lower-cased if it contains capitals;
#                     an all-digit id whose length is not 7 is reported.
#   anything else   - returned as is.
# In every case "::" is first reduced to ":" and trailing white space is
# dropped.
#
# The function works on its argument only; the caller's variables are not
# touched.  Repaired EC/TC numbers are cached in the package hash %Ref.
sub refFormatter {
    my ($ref) = @_;
    return $ref unless defined $ref;

    $ref =~ s/::/:/g;
    $ref =~ s/\s+$//;

    if (exists($Ref{$ref})) {
        return $Ref{$ref};
    }

    if ($ref =~ /([TE]C)[:| ](.*)/) {
        my $letter = $1;
        my $data   = $2;
        my $isTC   = ($letter =~ /T/) ? 1 : 0;
        my $null   = 0;
        my ($first, $last);

        # already well formed?
        if ($isTC) {
            if ($ref =~ /TC:[1-9\-]\.[A-E\-]\.(\d{1,}|-)\.(\d{1,}|-)\.(\d{1,}|-)/) {
                return $ref;
            }
            print "Error with TC ref $ref\n";
        }
        else {
            if ($ref =~ /EC:[1-6\-]\.(\d{1,}|-)\.(\d{1,}|-)\.(\d{1,}|-)/) {
                return $ref;
            }
            print "Error with EC ref $ref\n";
        }

        $data =~ s/--/-/g;
        $data =~ s/\.\./\./g;
        $data =~ s/(\d)-/$1/g;

        if ($isTC) {
            # the first three characters should be number dot letter
            $first = substr($data, 0, 3);
            $last  = length($data) > 3 ? substr($data, 3) : "";
            if ($first !~ /^[1-9]\.[A-E]$/) {
                if ($first =~ /^[1-9\-]\.\-$/) {
                    # formatting is fine but set the null flag
                    $null = 1;
                }
                else {
                    print "Error with TC reference: TC:$data\n";
                    return $ref;
                }
            }
        }
        else {
            # the first character should be a number (1-6)
            $first = substr($data, 0, 1);
            $last  = length($data) > 1 ? substr($data, 1) : "";
            if ($first !~ /[1-6]/) {
                if ($first eq "-") {
                    # formatting fine, set null flag
                    $null = 1;
                }
                else {
                    print "Error with first value in EC number: EC:$data\n";
                    return $ref;
                }
            }
        }

        # a null reference is returned at once to save any further bother
        if ($null == 1) {
            return $letter.":".$first.".-.-.-";
        }

        # pad the remaining fields up to three
        chop($last) if substr($last, -1, 1) eq ".";
        my $fields = 0;
        while ($last =~ /\.(\d{1,}|-)/g) {
            $fields++;
        }
        while ($fields < 3) {
            $last .= ".-";
            $fields++;
        }
        $Ref{$ref} = $letter.":".$first.$last;
        return $Ref{$ref};
    }
    elsif ($ref =~ /(PMID|ISBN)/) {
        $ref =~ s/[-\.\s]//g;
    }
    elsif ($ref =~ /GO:(.*)/) {
        # keep the capture in a lexical: the matches below reset $1
        my $tail = $1;
        if ($tail =~ /[A-Z]/) {
            $ref = "GO:".lc($tail);
        }
        if ($tail =~ /^\d+$/ and length($tail) != 7) {
            print "Incorrect GOid length $ref\n";
        }
    }
    return $ref;
}