File Coverage

blib/lib/Bio/ASN1/Sequence/Indexer.pm
Criterion Covered Total %
statement 13 15 86.7
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             =head1 NAME
2            
3             Bio::ASN1::Sequence::Indexer - Indexes NCBI Sequence files.
4            
5             =head1 SYNOPSIS
6            
7             use Bio::ASN1::Sequence::Indexer;
8            
9             # creating & using the index is just a few lines
10             my $inx = Bio::ASN1::Sequence::Indexer->new(
11             -filename => 'seq.idx',
12             -write_flag => 'WRITE'); # needed for make_index call, but if opening
13             # existing index file, don't set write flag!
14             $inx->make_index('seq1.asn', 'seq2.asn');
15             my $seq = $inx->fetch('AF093062'); # Bio::Seq obj for Sequence (doesn't work yet)
16             # alternatively, if one prefers just a data structure instead of objects
17             $seq = $inx->fetch_hash('AF093062'); # a hash produced by Bio::ASN1::Sequence
18             # that contains all data in the Sequence record
19            
20             =head1 PREREQUISITE
21            
22             Bio::ASN1::Sequence, Bioperl and all dependencies therein.
23            
24             =head1 INSTALLATION
25            
26             Same as Bio::ASN1::EntrezGene
27            
28             =head1 DESCRIPTION
29            
30             Bio::ASN1::Sequence::Indexer is a Perl Indexer for NCBI Sequence genome
31             databases. It processes an ASN.1-formatted Sequence record and stores the
32             file position for each record in a way compliant with the Bioperl standard (in
33             fact it's a subclass of Bioperl's index objects).
34            
35             Note that this module does not parse the records, because it needs to run fast
36             and grab only the accession ids. To parse a record, use Bio::ASN1::Sequence.
37            
38             As with Bio::ASN1::Sequence, this module is best thought of as a beta version -
39             it works, but is not fully tested.
40            
41             =head1 SEE ALSO
42            
43             Please check out perldoc for Bio::ASN1::EntrezGene for more info.
44            
45             =head1 AUTHOR
46            
47             Dr. Mingyi Liu <mingyi.liu@gpc-biotech.com>
48            
49             =head1 COPYRIGHT
50            
51             The Bio::ASN1::EntrezGene module and its related modules and scripts
52             are copyright (c) 2005 Mingyi Liu, GPC Biotech AG and Altana Research
53             Institute. All rights reserved. I created these modules when working
54             on a collaboration project between these two companies. Therefore a
55             special thanks for the two companies to allow the release of the code
56             into public domain.
57            
58             You may use and distribute them under the terms of the Perl itself or
59             GPL (L<http://www.gnu.org/copyleft/gpl.html>).
60            
61             =head1 CITATION
62            
63             Liu, M and Grigoriev, A (2005) "Fast Parsers for Entrez Gene"
64             Bioinformatics. In press
65            
66             =head1 OPERATING SYSTEMS SUPPORTED
67            
68             Any OS that Perl & Bioperl run on.
69            
70             =head1 METHODS
71            
72             =cut
73              
74             package Bio::ASN1::Sequence::Indexer;
75              
76 1     1   14 use strict;
  1         9  
  1         15  
77 1     1   44 use Carp qw(carp croak);
  1         9  
  1         18  
78 1     1   16 use vars qw ($VERSION @ISA);
  1         9  
  1         31  
79 1     1   16 use Bio::ASN1::Sequence;
  1         10  
  1         16  
80 1     1   13 use Bio::Index::AbstractSeq;
  0            
  0            
81              
# This class is a subclass of Bio::Index::AbstractSeq.
@ISA = ('Bio::Index::AbstractSeq');
# Version string of this module; _version() hands it back to callers.
$VERSION = '1.09';
84              
# Report the version of this indexer: the current value of the
# package-level $VERSION.
sub _version
{
  my $current = $VERSION;
  return $current;
}
89              
# Return the fixed type-stamp string for this kind of index.
# NOTE(review): presumably the Bio::Index base class uses this to tag and
# recognise index files - confirm against Bio::Index::Abstract.
sub _type_stamp
{
  my $stamp = '__Sequence_ASN1__';
  return $stamp;
}
94              
# Scan one ASN.1 Sequence file and register every accession found in it.
#
# Parameters: $file - path of the ASN.1 file to index
#             $idx  - index number of that file, handed on to add_record()
# Returns:    1 once the whole file has been scanned
# Throws:     through $self->throw() when the file cannot be opened
#
# The file is read one "Seq-entry ::= set {" delimited chunk at a time. Each
# accession matched inside a chunk is stored, via add_record(), together
# with the byte offset at which that chunk's delimiter starts.
sub _index_file
{
  my($self, $file, $idx) = @_;
  # Offset of the record currently being scanned. Starting at 0 means text in
  # front of the first delimiter is never recorded with an undefined position.
  my $position = 0;
  # Localise the bareword handle so an IN handle the caller has open is left
  # untouched. A bareword (not a lexical handle) is kept because this file
  # states that it targets perl 5.005_03.
  local *IN;
  # Explicit read mode, so a leading '>' in the file name is not taken as an
  # open mode.
  open(IN, "< $file") || $self->throw("Can't open $file - $!");
  local $/ = "Seq-entry ::= set {";
  my $sep_length = length($/);
  # Read into a lexical instead of $_, which would otherwise stay clobbered
  # for the caller after this sub returns.
  while(defined(my $chunk = <IN>))
  {
    chomp($chunk); # removes the trailing delimiter, if there is one
    while($chunk =~ /[,{}]\s+accession\s*"([^"]+)"\s+[,{}]/ig) # add both dna and protein
    {
      $self->add_record($1, $idx, $position);
    }
    # The delimiter just consumed opens the next record, so step back over it.
    $position = tell(IN) - $sep_length;
  }
  close(IN);
  return 1;
}
113              
# Return the format name of the files handled by this indexer.
sub _file_format
{
  my $format = 'sequence';
  return $format;
}
118              
119             =head2 fetch
120            
121             Parameters: $seqid - id (accession) of the Sequence record to be retrieved
122             Example: my $seq = $indexer->fetch('AF093062'); # get Sequence AF093062
123             Function: fetch the data for the given Sequence id.
124             Returns: A Bio::Seq object produced by Bio::SeqIO::sequence
125             Notes: Bio::SeqIO::sequence does not exist and probably won't
126             exist for a while! So call fetch_hash instead
127            
128             =cut
129              
130             =head2 fetch_hash
131            
132             Parameters: $seqid - id for the Sequence record to be retrieved
133             Example: my $hash = $indexer->fetch_hash('AF093062');
134             Function: fetch a hash produced by Bio::ASN1::Sequence for given id
135             Returns: A data structure containing all data items from the Sequence
136             record.
137             Notes: Alternative to fetch()
138            
139             =cut
140              
# Fetch the data structure for one indexed Sequence record.
#
# Parameters: $seqid - id under which the record was indexed
# Returns:    whatever Bio::ASN1::Sequence's next_seq() yields for the record,
#             or nothing (undef in scalar context) when $seqid is undefined
#             or not present in the index
# Throws:     through $self->throw() when the data file cannot be positioned
#             at the stored offset
sub fetch_hash
{
  my ($self, $seqid) = @_;
  # An undefined id can never be in the index; do not use it as a hash key.
  return unless defined $seqid;
  my $seq = $self->db->{$seqid};
  # Explicit miss instead of falling off the end of an if-block.
  return unless $seq;
  my ($fileno, $position) = $self->unpack_record($seq);
  my $parser = Bio::ASN1::Sequence->new('fh' => $self->_file_handle($fileno));
  # A failed seek would make the parser read from the wrong place and hand
  # back an unrelated record, so report it instead of ignoring it.
  seek($parser->fh, $position, 0)
    || $self->throw("Can't seek to position $position for '$seqid' - $!");
  return $parser->next_seq;
}
152              
153             =head2 _file_handle
154            
155             Title : _file_handle
156             Usage : $fh = $index->_file_handle( INT )
157             Function: Returns an open filehandle for the file
158             index INT. On opening a new filehandle it
159             caches it in the @{$index->_filehandle} array.
160             If the requested filehandle is already open,
161             it simply returns it from the array.
162             Example : $first_file_indexed = $index->_file_handle( 0 );
163             Returns : ref to a filehandle
164             Args : INT
165             Notes : This function is copied from Bio::Index::Abstract. Once that module
166             changes file handle code like I do below to fit perl 5.005_03, this
167             sub would be removed from this module
168            
169             =cut
170              
# Return the cached filehandle for the data file with index number $i,
# opening and caching it on first use.
#
# Parameters: $i - index number of the file
# Returns:    the filehandle stored in $self->{'_filehandle'}[$i]
# Throws:     through $self->throw() when no file name is recorded for $i or
#             when the file cannot be opened
sub _file_handle {
    my( $self, $i ) = @_;

    # Already opened earlier: hand back the cached handle.
    return $self->{'_filehandle'}[$i] if $self->{'_filehandle'}[$i];

    # The file name is the first field of the "__FILE_$i" record.
    my @fields = $self->unpack_record($self->db->{"__FILE_$i"})
        or $self->throw("Can't get filename for index : $i");
    my $file = $fields[0];

    # A localised glob gives each call a fresh handle; the copy put into the
    # cache keeps that handle after this scope ends.
    local *FH;
    open *FH, $file or $self->throw("Can't read file '$file' : $!");
    $self->{'_filehandle'}[$i] = *FH; # Cache filehandle

    return $self->{'_filehandle'}[$i];
}
184              
185             1;
186              
187