#!/usr/bin/perl -w # =========================================================================== # # PUBLIC DOMAIN NOTICE # National Center for Biotechnology Information # # This software/database is a "United States Government Work" under the # terms of the United States Copyright Act. It was written as part of # the author's official duties as a United States Government employee and # thus cannot be copyrighted. This software/database is freely available # to the public for use. The National Library of Medicine and the U.S. # Government have not placed any restriction on its use or reproduction. # # Although all reasonable efforts have been taken to ensure the accuracy # and reliability of the software and data, the NLM and the U.S. # Government do not and cannot warrant the performance or results that # may be obtained by using this software or data. The NLM and the U.S. # Government disclaim all warranties, express or implied, including # warranties of performance, merchantability or fitness for any particular # purpose. # # Please cite the author in any work or product based on this material. # # =========================================================================== # # Author: Craig Wallin # # File Description: # # Accepts query as an argument # Gets document summary (DocSum) information from Entrez Gene via e-utils # Outputs either XML or a specified set of fields in tab-delimited format # use strict; # --------------------------------------------------------------------------- # use LWP::Simple; # Define library for the 'get' function. # --------------------------------------------------------------------------- # # Global variables my $verbose = 0; my $dopt; my $query = ""; my $beginTime = time(); my $startTag = ""; my $stopTag = "\n"; my @tags_to_find = ("geneId"); # start with geneId, add user's options to it # --------------------------------------------------------------------------- # # Main program # &processCommandLineParameters; &verifyInput; &printHeader if $dopt eq 'tab'; &process; &end; ################################################################################ # sub processCommandLineParameters { $dopt = ""; while(defined ($_= shift @ARGV)) { if ($_ eq '-h') { &showUsageAndExit; } elsif ($_ eq '-v') { $verbose = 1; } elsif ($_ eq '-q') { $query = shift @ARGV; } elsif ($_ eq '-t') { my $tag = shift @ARGV; push @tags_to_find, $tag; } elsif ($_ eq '-o') { $dopt = shift @ARGV; } else { # Ignore extra input } } if ($query eq "") {&showUsageAndExit} if ($dopt eq "") {&showUsageAndExit} } ################################################################################ # sub verifyInput { unless ($query) { print STDERR "query (-q) is required\n"; &showUsageAndExit; } unless (($dopt eq 'xml') or ($dopt eq 'tab')) { print STDERR "output (-o) is required and must equal 'xml' or 'tab'\n"; &showUsageAndExit; } } ################################################################################ # sub showUsageAndExit { my $usage = qq/ Usage: $0 [options] -q query -o xml|tab Options: -h Display this usage help information -v Verbose -q Query to run against Entrez Gene, e.g. "has summary[prop]" -o Output options xml - XML tab - tab-delimited -t Tag from eutils xml to extract, e.g. "Summary" - is case sensitive - may be specified multiple times to extract multiple tags & values - used only with "-o tab" option - to see all available xml tags in the DocSum, run first with "-o xml" option /; print STDERR "$usage\n\n"; &end } ################################################################################ # sub printHeader { print STDOUT join( "\t", @tags_to_find ) . "\n"; } ################################################################################ # sub process { my $geneStart = 0; my $maxGenes = 250; my $totalGenes; my $haveTotal = 0; my $geneId = ""; my $geneCount = 0; my $qs; my $esearch_result; my $esummary_result; my $first = 1; # Main loop: get up to maxGenes Gene ID's for requested query from Gene $totalGenes = $geneStart+1; # to get started GeneLoop: for (; $geneStart<$totalGenes; $geneStart+=$maxGenes) { if ($verbose) {print STDOUT "Processing genes $geneStart to ", $geneStart+$maxGenes-1, "\n";} #this option looks for GeneIDs with the $query value of interest $qs = "db=gene&retstart=$geneStart&retmax=$maxGenes&term=$query"; $esearch_result = &Eutil ("esearch", $qs); if ($esearch_result =~ m/(\d+)<\/Count>/) { if ($haveTotal) { if ($totalGenes != $1) { die "esearch reported new total genes: was $totalGenes; now $1\nFor request $qs\n"; } } else { $totalGenes = $1; # extract total $haveTotal = 1; } } else { die "$esearch_result did not contain expected ,\n for request $qs\n"; } # Build querystring for GENE search $qs = "db=gene&id="; while ($esearch_result =~ m/(\d+)<\/Id>/g) { $geneCount++; $geneId = $1; # extract a geneId $qs .= "$geneId,"; } chop($qs); # remove last comma $qs .= "&filter=asis"; # Get Gene summary $esummary_result = &Eutil ("esummary", $qs); # Extract and output information for all genes if ($dopt eq 'tab'){ &extractAndOutput ($esummary_result); } else { &extractAndOutputXml ($esummary_result, $first); } $first = 0; if ($verbose) { print STDOUT "\ngeneCount: $geneCount\n"; } } # Close xml output if ($dopt eq 'xml') {print STDOUT "$stopTag\n"} } ################################################################################ # sub extractAndOutput { my $xml = $_[0]; my $match; my $geneId; my $name; my $description; while ($xml =~ m/(.*?)<\/DocumentSummary>/gs) { $geneId = $1; $match = $2; my %values = (); $values{ geneId } = $geneId; foreach my $tag ( @tags_to_find ) { while ( $match =~ m|<$tag>(.*?)|gs ) { my $val = $1; next if $val eq ""; if ( exists $values{ $tag } ) { $values{ $tag } .= "|$val"; # multiple values concatenated with pipes } else { $values{ $tag } = $val; # first value seen for this tag and gene so far } } } my $out = ""; my $val; foreach my $tag ( @tags_to_find ) { $val = $values{ $tag } || "-"; $out .= "$val\t"; } chop($out); # remove last tab print "$out\n"; } } ################################################################################ # sub extractAndOutputXml { my $xml = shift; my $first = shift; # Strip leading tags, # from all but first set # Strip closing tag from all if ($first) { my $start = 0; my $stop = index ($xml, $stopTag); print STDOUT substr ($xml, $start, $stop-$start); } else { my $start = index ($xml, $startTag) + length($startTag); my $stop = index ($xml, $stopTag); print STDOUT substr ($xml, $start, $stop-$start); } } ################################################################################ # # Subroutine to handle all eutil calls # # Create a BEGIN block to provide effect of static data for sub BEGIN { # static data my $lastEutilTime = 0; # init to avoid delay on first Eutil # storing local constants here too. my $eutilBaseUrl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"; my $minUtilPeriod = 3; # seconds sub Eutil { my ($eutil, $querystringData) = @_; my $elapsed = time() - $lastEutilTime; my $delay = $minUtilPeriod - $elapsed; if ($delay < 0) {$delay = 0} sleep ($delay); $lastEutilTime = time(); # save for next time my $eutilUrl = "$eutilBaseUrl/$eutil.fcgi?$querystringData"; if ($verbose) {print STDOUT "\neutilUrl: $eutilUrl\n";} my $result = get($eutilUrl); # get result of the eutil for return if ((not defined $result) or ($result eq "")) { $result = ""; # Simplify error testing on return print STDERR "$eutil failed for request: $querystringData\n\n"; } if ($verbose) { print STDOUT "\neutil result: $result\n"; my $elapsedTime = time() - $lastEutilTime; print STDOUT "$eutil took $elapsedTime seconds\n"; } $result; # for return } } ################################################################################ # sub end { if ($verbose) { my $elapsedTime = time() - $beginTime; print STDOUT "\nElapsed time: $elapsedTime seconds\n"; } exit; }