#!/usr/bin/perl
#
# By J. Clark and T. Deegan
# January 2005
#
# This script adds a subset line to every term that is to be part of a GO
# subset.
#
# To run it you need, in the current directory:
#   - a file called 'numbers' listing the IDs of the terms that you want in
#     the subset, one per line, written exactly as they appear on the 'id:'
#     lines of the ontology file (for example GO:0000001);
#   - an up-to-date copy of the ontologies in OBO format, called
#     'gene_ontology.obo'.
#
# The modified ontology is printed to standard output; the input files are
# not changed.
#
# Before running the script, set $subset_line (just below) to the subset line
# you want added.
# After running the script you need to add a line to the header section of the
# ontology file to show that a new subset has been added.

use strict;
use warnings;

# The line that is inserted into every selected term. Edit this before running.
my $subset_line = "subset: prok\n";

# Input files, looked up in the current directory.
my $numbers_file  = 'numbers';
my $ontology_file = 'gene_ontology.obo';

# Set of term IDs that should receive the subset line (the keys are the IDs;
# the values are not used).
my %wanted_ids;

# ---- Step 1: read the list of wanted term IDs ------------------------------

open(my $numbers_fh, '<', $numbers_file)
    or die "Can't open $numbers_file: $!\n";

while (my $id = <$numbers_fh>) {

    # Strip the line break and any stray surrounding whitespace (including the
    # carriage return of a Windows-edited file), which would otherwise stop
    # the ID from ever matching an 'id:' line in the ontology.
    $id =~ s/^\s+|\s+$//g;

    # Ignore blank lines instead of recording an empty ID.
    next if $id eq '';

    $wanted_ids{$id} = 1;
}

close $numbers_fh;

# ---- Step 2: copy the ontology to STDOUT, inserting the subset lines --------

open(my $ontology_fh, '<', $ontology_file)
    or die "Can't open $ontology_file: $!\n";

# True while we are inside a wanted term, between its 'id:' line and the place
# where the subset line belongs.
my $awaiting_insert = 0;

# The most recent line printed; only needed for the end-of-file case below.
my $last_line = '';

while (my $line = <$ontology_fh>) {

    # Inside a wanted term, the tags listed here are the ones that go between
    # the 'id:' line and the 'subset:' line. The first line that is NOT one of
    # them marks the place to put the subset line, so print it just before
    # that line and stop waiting.
    if ($awaiting_insert
        && $line !~ m/^(?:is_anonymous|name|alt_id|namespace|def|comment): /) {
        print $subset_line;
        $awaiting_insert = 0;
    }

    # Every input line is copied to the output unchanged.
    print $line;
    $last_line = $line;

    # An 'id:' line for one of the wanted terms means the following lines must
    # be watched for the insertion point. Trailing whitespace is tolerated so
    # that files with Windows line endings still match.
    if ($line =~ m/^id: (GO:[0-9]+)\s*$/ && exists $wanted_ids{$1}) {
        $awaiting_insert = 1;
    }
}

# If the file ended while we were still inside a wanted term, nothing came
# along to trigger the insertion above, so add the subset line now. Make sure
# it starts on a line of its own even if the file had no final line break.
if ($awaiting_insert) {
    print "\n" if $last_line !~ /\n\z/;
    print $subset_line;
}

close $ontology_fh;


    


