#!/usr/bin/perl -w # This script uses the Bio::SearchIO module # to parse a BLAST output, extracts GI numbers # from the Hit descriptions, and retrieves the # sequences with those GI numbers from GenBank, # using the Bio::DB::GenBank module. It saves # these sequences as a FASTA format file. # Author: MyName, 03 Jun 2003 # Version 1 use Bio::SearchIO; use Bio::DB::GenBank; use Getopt::Long; my ($blastfil, $pattern, $SrchIO, $result, $sname), my ($sdesc, @part, @gi, $ngi, $hit, $hitlen); my ($hsp, $hspPct, $hspEval, $inSeqIO, $outSeqIO); my ($gb, $seqobj); GetOptions("file=s" => \$blastfil, "pat=s" => \$pattern); # Check to see if -file and -pat were specified if (!defined $blastfil || !defined $pattern) { die "Usage $0 -file BLASTfile -pat pattern\n"; } # Open an output file for writing Hit info open(OUTFILE, ">$blastfil.hit_info") or die "Cannot open $blastfil.hit_info for writing\n"; # Create a SearchIO object to get BLAST results $SrchIO = Bio::SearchIO->new('-format' => 'blast', '-file' => "$blastfil"); # Loop through each BLAST result while ($result = $SrchIO->next_result()) { $qname = $result->query_name(); # Loop through each hit for this query while ($hit = $result->next_hit()) { $sname = $hit->name(); $sdesc = $hit->description(); if ($sdesc !~ /$pattern/) { next; } @part = split /\|/, $sname; if ($part[0] ne 'gi') { print "No gi in $sname\n"; next; } push @gi, "$part[1]"; $hitlen = $hit->length(); # Get info for each HSP within the Hit while ($hsp = $hit->next_hsp) { $strand = $hsp->hit->strand; $hspPct = $hsp->frac_identical(); $hspEval = $hsp->evalue(); print OUTFILE "$qname\t$sname\t"; print OUTFILE "$sdesc\t$hitlen\t"; print OUTFILE "$hspPct\t$strand\t"; print OUTFILE "$hspEval\n"; } # End of while next HSP } # End of while next hit } # End of while next result close OUTFILE; # Create a SeqIO object to write a FASTA file $outSeqIO = Bio::SeqIO->new('-format' => 'fasta', '-file' => ">$blastfil.fasta"); # Create a GenBank object that will give us a # SeqIO object to use for input $gb = Bio::DB::GenBank->new('-format' => 'fasta', '-retrievaltype' => 'tempfile'); $ngi = @gi; if ($ngi > 0) { print "Retrieving $ngi seqs from GenBank... \n"; $inSeqIO = $gb->get_Stream_by_id(\@gi); while ($seqobj = $inSeqIO->next_seq()) { $outSeqIO->write_seq($seqobj); } } else { print "No subjects matching $pattern found\n"; }