File Coverage

blib/lib/Bio/ASN1/EntrezGene/Indexer.pm
Criterion Covered Total %
statement 13 15 86.7
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 20 90.0


line stmt bran cond sub pod time code
1             =head1 NAME
2            
3             Bio::ASN1::EntrezGene::Indexer - Indexes NCBI Entrez Gene files.
4            
5             =head1 SYNOPSIS
6            
7             use Bio::ASN1::EntrezGene::Indexer;
8            
9             # creating & using the index is just a few lines
10             my $inx = Bio::ASN1::EntrezGene::Indexer->new(
11             -filename => 'entrezgene.idx',
12             -write_flag => 'WRITE'); # needed for make_index call, but if opening
13             # existing index file, don't set write flag!
14             $inx->make_index('Homo_sapiens', 'Mus_musculus', 'Rattus_norvegicus');
15             my $seq = $inx->fetch(10); # Bio::Seq obj for Entrez Gene #10
16             # alternatively, if one prefers just a data structure instead of objects
17             $seq = $inx->fetch_hash(10); # a hash produced by Bio::ASN1::EntrezGene
18             # that contains all data in the Entrez Gene record
19            
20             # note that in case you wonder, you can get the files 'Homo_sapiens'
21             # from NCBI Entrez Gene ftp download, DATA/ASN/Mammalia directory
22            
23             =head1 PREREQUISITE
24            
25             Bio::ASN1::EntrezGene, Bioperl version that contains Stefan Kirov's
26             entrezgene.pm and all dependencies therein.
27            
28             =head1 INSTALLATION
29            
30             Same as Bio::ASN1::EntrezGene
31            
32             =head1 DESCRIPTION
33            
34             Bio::ASN1::EntrezGene::Indexer is a Perl Indexer for NCBI Entrez Gene genome
35             databases. It processes an ASN.1-formatted Entrez Gene record and stores the
36             file position for each record in a way compliant with Bioperl standard (in
37             fact it's a subclass of Bioperl's index objects).
38            
39             Note that this module does not parse records, because it needs to run fast and
40             grab only the gene ids. For parsing records, use Bio::ASN1::EntrezGene, or
41             better yet, use Bio::SeqIO, format 'entrezgene'.
42            
43             It takes this module (version 1.07) 21 seconds to index the human genome
44             Entrez Gene file (Apr. 5/2005 download) on one 2.4 GHz Intel Xeon processor.
45            
46             =head1 SEE ALSO
47            
48             For details on various parsers I generated for Entrez Gene, and example scripts that
49             use/benchmark the modules, please see L<http://sourceforge.net/projects/egparser/>.
50             Those other parsers etc. are included in V1.05 download.
51            
52             =head1 AUTHOR
53            
54             Dr. Mingyi Liu <mingyi.liu@gpc-biotech.com>
55            
56             =head1 COPYRIGHT
57            
58             The Bio::ASN1::EntrezGene module and its related modules and scripts
59             are copyright (c) 2005 Mingyi Liu, GPC Biotech AG and Altana Research
60             Institute. All rights reserved. I created these modules when working
61             on a collaboration project between these two companies. Therefore a
62             special thanks to the two companies for allowing the release of the code
63             into the public domain.
64            
65             You may use and distribute them under the same terms as Perl itself or
66             the GPL (L<http://www.gnu.org/copyleft/gpl.html>).
67            
68             =head1 CITATION
69            
70             Liu, M and Grigoriev, A (2005) "Fast Parsers for Entrez Gene"
71             Bioinformatics. In press
72            
73             =head1 OPERATING SYSTEMS SUPPORTED
74            
75             Any OS that Perl & Bioperl run on.
76            
77             =head1 METHODS
78            
79             =cut
80              
81             package Bio::ASN1::EntrezGene::Indexer;
82              
83 1     1   14 use strict;
  1         10  
  1         15  
84 1     1   14 use Carp qw(carp croak);
  1         9  
  1         19  
85 1     1   15 use vars qw ($VERSION @ISA);
  1         9  
  1         14  
86 1     1   21 use Bio::ASN1::EntrezGene;
  1         10  
  1         69  
87 1     1   17 use Bio::Index::AbstractSeq;
  0            
  0            
88              
89             @ISA = qw(Bio::Index::AbstractSeq);
90             $VERSION = '1.09';
91              
92             sub _version
93             {
94               return $VERSION;
95             }
96              
97             sub _type_stamp
98             {
99               return '__EntrezGene_ASN1__';
100             }
101              
102             sub _index_file
103             {
104               my($self, $file, $idx) = @_;
105               my $position;
106               open(IN, $file) || $self->throw("Can't open $file - $!");
107               local $/ = "Entrezgene ::= {";
108               while(<IN>)
109               {
110                 chomp;
111                 $self->add_record($1, $idx, $position) if (/[,{}]\s+geneid\s*(\d+)\s+[,{}]/i);
112                 $position = tell(IN) - 16; # $/'s length
113               }
114               close(IN);
115               return 1;
116             }
117              
118             sub _file_format
119             {
120               return 'entrezgene';
121             }
122              
123             =head2 fetch
124            
125             Parameters: $geneid - id for the Entrez Gene record to be retrieved
126             Example: my $seq = $indexer->fetch(10); # get Entrez Gene #10
127             Function: fetch the data for the given Entrez Gene id.
128             Returns: A Bio::Seq object produced by Bio::SeqIO::entrezgene
129             Notes: One needs to have Bio::SeqIO::entrezgene installed before
130             calling this function!
131            
132             =cut
133              
134             =head2 fetch_hash
135            
136             Parameters: $geneid - id for the Entrez Gene record to be retrieved
137             Example: my $hash = $indexer->fetch_hash(10); # get Entrez Gene #10
138             Function: fetch a hash produced by Bio::ASN1::EntrezGene for given Entrez
139             Gene id.
140             Returns: A data structure containing all data items from the Entrez
141             Gene record.
142             Notes: Alternative to fetch()
143            
144             =cut
145              
146             sub fetch_hash
147             {
148               my ($self, $geneid) = @_;
149               if (my $gene = $self->db->{$geneid})
150               {
151                 my ($fileno, $position) = $self->unpack_record($gene);
152                 my $parser = Bio::ASN1::EntrezGene->new('fh' => $self->_file_handle($fileno));
153                 seek($parser->fh, $position, 0);
154                 return $parser->next_seq;
155               }
156             }
157              
158             =head2 _file_handle
159            
160             Title : _file_handle
161             Usage : $fh = $index->_file_handle( INT )
162             Function: Returns an open filehandle for the file
163             index INT. On opening a new filehandle it
164             caches it in the @{$index->_filehandle} array.
165             If the requested filehandle is already open,
166             it simply returns it from the array.
167             Example : $first_file_indexed = $index->_file_handle( 0 );
168             Returns : ref to a filehandle
169             Args : INT
170             Notes : This function is copied from Bio::Index::Abstract. Once that module
171             changes its file handle code as I do below to fit perl 5.005_03, this
172             sub will be removed from this module.
173            
174             =cut
175              
176             sub _file_handle {
177             my( $self, $i ) = @_;
178              
179             unless ($self->{'_filehandle'}[$i]) {
180             my @rec = $self->unpack_record($self->db->{"__FILE_$i"})
181             or $self->throw("Can't get filename for index : $i");
182             my $file = $rec[0];
183             local *FH;
184             open *FH, $file or $self->throw("Can't read file '$file' : $!");
185             $self->{'_filehandle'}[$i] = *FH; # Cache filehandle
186             }
187             return $self->{'_filehandle'}[$i];
188             }
189              
190             1;
191              
192