File Coverage

blib/lib/Bio/ASN1/EntrezGene.pm
Criterion Covered Total %
statement 113 126 89.7
branch 55 76 72.4
condition 22 45 48.9
subroutine 11 12 91.7
pod 8 8 100.0
total 209 267 78.3


line stmt bran cond sub pod time code
1             =head1 NAME
2            
3             Bio::ASN1::EntrezGene - Regular expression-based Perl Parser for NCBI Entrez Gene.
4            
5             =head1 SYNOPSIS
6            
7             use Bio::ASN1::EntrezGene;
8            
9             my $parser = Bio::ASN1::EntrezGene->new('file' => "Homo_sapiens");
10             while(my $result = $parser->next_seq)
11             {
12             # extract data from $result, or Dumpvalue->new->dumpValue($result);
13             }
14            
15             # a new way to get the $result data hash for a particular gene id:
16             use Bio::ASN1::EntrezGene::Indexer;
17             my $inx = Bio::ASN1::EntrezGene::Indexer->new(-filename => 'entrezgene.idx');
18             my $seq = $inx->fetch_hash(10); # returns $result for Entrez Gene record
19             # with geneid 10
20             # note that the index file 'entrezgene.idx' can be created as follows
21             my $inx = Bio::ASN1::EntrezGene::Indexer->new(
22             -filename => 'entrezgene.idx',
23             -write_flag => 'WRITE');
24             $inx->make_index('Homo_sapiens', 'Mus_musculus'); # files come from NCBI download
25            
26             # for more detail please refer to Bio::ASN1::EntrezGene::Indexer perldoc
27            
28             =head1 PREREQUISITE
29            
30             None.
31            
32             =head1 INSTALLATION
33            
34             Bio::ASN1::EntrezGene package can be installed & tested as follows:
35            
36             perl Makefile.PL
37             make
38             make test
39             make install
40            
41             =head1 DESCRIPTION
42            
43             Bio::ASN1::EntrezGene is a regular expression-based Perl Parser for NCBI Entrez
44             Gene genome databases (L<http://www.ncbi.nih.gov/entrez/query.fcgi?db=gene>). It
45             parses an ASN.1-formatted Entrez Gene record and returns a data structure that
46             contains all data items from the gene record.
47            
48             The parser will report error & line number if input data does not conform to the
49             NCBI Entrez Gene genome annotation file format.
50            
51             Note that it is possible to provide reading of all NCBI's ASN.1-formatted
52             files through simple variations of the Entrez Gene parser (I need more
53             investigation to be sure, but at least the sequence parser is a very simple
54             variation on Entrez Gene parser and works well).
55            
56             It took the parser version 1.0 11 minutes to parse the human genome Entrez Gene
57             file on one 2.4 GHz Intel Xeon processor. The addition of validation and error
58             reporting in 1.03 and handling of new Entrez Gene format slowed the parser down
59             about 40%.
60            
61             Since V1.07, this package also included an indexer that runs pretty fast (it
62             takes 21 seconds for the indexer to index the human genome on the same
63             processor). Therefore the combination of the modules would allow user to
64             retrieve and parse arbitrary records.
65            
66             =head1 SEE ALSO
67            
68             The parse_entrez_gene_example.pl script included in this package (please
69             see the Bio-ASN1-EntrezGene-x.xx/examples directory) is a very
70             important and near-complete demo on using this module to extract all data
71             items from Entrez Gene records. Do check it out because in fact, this
72             script took me about 3-4 times more time to make for my project than the
73             parser V1.0 itself. Note that the example script was edited to leave
74             out stuff specific to my internal project.
75            
76             For details on various parsers I generated for Entrez Gene, example scripts that
77             uses/benchmarks the modules, please see L<http://sourceforge.net/projects/egparser/>.
78             Those other parsers etc. are included in V1.05 download.
79            
80             =head1 AUTHOR
81            
82             Dr. Mingyi Liu <mingyi.liu@gpc-biotech.com>
83            
84             =head1 COPYRIGHT
85            
86             The Bio::ASN1::EntrezGene module and its related modules and scripts
87             are copyright (c) 2005 Mingyi Liu, GPC Biotech AG and Altana Research
88             Institute. All rights reserved. I created these modules when working
89             on a collaboration project between these two companies. Therefore a
90             special thanks for the two companies to allow the release of the code
91             into public domain.
92            
93             You may use and distribute them under the terms of the Perl itself or
94             GPL (L<http://www.gnu.org/copyleft/gpl.html>).
95            
96             =head1 CITATION
97            
98             Liu, M and Grigoriev, A (2005) "Fast Parsers for Entrez Gene"
99             Bioinformatics. In press
100            
101             =head1 OPERATION SYSTEMS SUPPORTED
102            
103             Any OS that Perl runs on.
104            
105             =head1 METHODS
106            
107             =cut
108              
109             package Bio::ASN1::EntrezGene;
110              
111 2     2   27 use strict;
  2         18  
  2         30  
112 2     2   29 use Carp qw(carp croak);
  2         20  
  2         39  
113 2     2   30 use vars qw ($VERSION);
  2         18  
  2         58  
114              
115             $VERSION = '1.09';
116              
117             =head2 new
118            
119             Parameters: maxerrstr => 20 (optional) - maximum number of characters after
120             offending element, used by error reporting, default is 20
121             file or -file => $filename (optional) - name of the file to be
122             parsed. call next_seq to parse!
123             fh or -fh => $filehandle (optional) - handle of the file to be
124             parsed.
125             Example: my $parser = Bio::ASN1::EntrezGene->new();
126             Function: Instantiate a parser object
127             Returns: Object reference
128             Notes: Setting file or fh will reset line numbers etc. that are used
129             for error reporting purposes, and seeking on file handle would
130             mess up linenumbers!
131            
132             =cut
133              
134             sub new
135             {
136 1     1 1 11   my $class = shift;
137 1 50       13   $class = ref($class) if(ref($class));
138 1         77   my $self = { maxerrstr => 20, @_ };
139 1         12   bless $self, $class;
140 1 100       11   map { $self->input_file($self->{$_}) if($self->{$_}) } qw(file -file);
  2         28  
141 1 100       10   map { $self->fh($self->{$_}) if($self->{$_}) } qw(fh -fh);
  2         38  
142 1         13   return $self;
143             }
144              
145             =head2 maxerrstr
146            
147             Parameters: $maxerrstr (optional) - maximum number of characters after
148             offending element, used by error reporting, default is 20
149             Example: $parser->maxerrstr(20);
150             Function: get/set maxerrstr.
151             Returns: maxerrstr.
152             Notes:
153            
154             =cut
155              
156             sub maxerrstr
157             {
158 0     0 1 0   my ($self, $value) = @_;
159 0 0       0   $self->{maxerrstr} = $value if $value > 0;
160 0         0   return $self->{maxerrstr};
161             }
162              
163              
164             =head2 parse
165            
166             Parameters: $string that contains Entrez Gene record,
167             $trimopt (optional) that specifies how the data structure
168             returned should be trimmed. 2 is recommended and
169             default
170             $noreset (optional) that species that line number should not
171             be reset
172             DEPRECATED as external function!!! Do not call this function
173             directly! Call next_seq() instead
174             Example: my $value = $parser->parse($text); # DEPRECATED as
175             # external function!!! Do not call this function
176             # directly! Call next_seq() instead
177             Function: Takes in a string representing Entrez Gene record, parses
178             the record and returns a data structure.
179             Returns: A data structure containing all data items from the Entrez
180             Gene record.
181             Notes: DEPRECATED as external function!!! Do not call this function
182             directly! Call next_seq() instead
183             $string should not contain 'EntrezGene ::=' at beginning!
184            
185             =cut
186              
187             sub parse
188             {
189 1     1 1 251   my ($self, $input, $compact, $noreset) = @_;
190 1 50       15   $input || croak "must have input!\n";
191 1         190   $self->{input} = $input;
192 1 50       14   $self->{filename} = "input" unless $self->{filename};
193 1 50 33     16   $self->{linenumber} = 1 unless $self->{linenumber} && $noreset;
194 1         11   $self->{depth} = 0;
195 1         10   my $result;
196               eval
197 1         38   {
198 1         14     $result = $self->_parse(); # no need to reset $self->{depth} or linenumber
199               };
200 1 50       12   if($@)
201               {
202 0 0       0     if($@ !~ /^Data Error:/)
203                 {
204 0         0       croak "non-conforming data broke parser on line $self->{linenumber} in $self->{filename}\n".
205                         "possible cause includes randomly inserted brackets in input file before line $self->{linenumber}\n".
206                         "first $self->{maxerrstr} (or till end of input) characters including the non-conforming data:\n" .
207                         substr($self->{input}, pos($self->{input}), $self->{maxerrstr}) . "\nRaw error mesg: $@\n";
208                 }
209 0         0     else { die $@ }
210               }
211 1         13   trimdata($result, $compact);
212 1         20   return $result;
213             }
214              
215             =head2 input_file
216            
217             Parameters: $filename for file that contains Entrez Gene record(s)
218             Example: $parser->input_file($filename);
219             Function: Takes in name of a file containing Entrez Gene records.
220             opens the file and stores file handle
221             Returns: none.
222             Notes: Attemps to open file larger than 2 GB even on Perl that
223             does not support 2 GB file (accomplished by calling
224             "cat" and piping output. On OS that does not have "cat"
225             error message will be displayed)
226            
227             =cut
228              
229             sub input_file
230             {
231 1     1 1 12   my ($self, $filename) = @_;
232             # in case user's Perl system can't handle large file. Assuming Unix, otherwise raise error
233 1         10   local *IN; # older styled code to enable module to work with perl 5.005_03
234 1 50 0     79   open(*IN, $filename) ||
      33        
235               ($! =~ /too large/i && open(*IN, "cat $filename |")) ||
236                 croak "can't open $filename! -- $!\n";
237 1         13   $self->{fh} = *IN;
238 1         12   $self->{filename} = $filename;
239 1         15   $self->{linenumber} = 0; # reset line number
240             }
241              
242             =head2 next_seq
243            
244             Parameters: $trimopt (optional) that specifies how the data structure
245             returned should be trimmed. option 2 is recommended and
246             default
247             Example: my $value = $parser->next_seq();
248             Function: Use the file handle generated by input_file, parses the next
249             the record and returns a data structure.
250             Returns: A data structure containing all data items from the Entrez
251             Gene record.
252             Notes: Must pass in a filename through new() or input_file() first!
253             For details on how to use the $trimopt data trimming option
254             please see comment for the trimdata method. An option
255             of 2 is recommended and default
256             The acceptable values for $trimopt include:
257             1 - trim as much as possibile
258             2 (or 0, undef) - trim to an easy-to-use structure
259             3 - no trimming (in version 1.06, prior to version
260             1.06, 0 or undef means no trimming)
261            
262             =cut
263              
264             sub next_seq
265             {
266 1     1 1 10   my ($self, $compact) = @_;
267 1 50       20   $self->{fh} || croak "you must pass in a file name or handle through new() or input_file() first before calling next_seq!\n";
268 1         14   local $/ = "Entrezgene ::= {"; # set record separator
269 1         138   while($_ = readline($self->{fh}))
270               {
271 2         25     chomp;
272 2 100       546     next unless /\S/;
273 1 50       429     my $tmp = (/^\s*Entrezgene ::= ({.*)/si)? $1 : "{" . $_; # get rid of the 'Entrezgene ::= ' at the beginning of Entrez Gene record
274 1         16     return $self->parse($tmp, $compact, 1); # 1 species no resetting line number
275               }
276             }
277              
278             # NCBI's Apr 05, 2005 format change forced much usage of lookahead, which would for
279             # sure slows parser down. But can't code efficiently without it.
280             sub _parse
281             {
282 455     455   7525   my ($self, $flag) = @_;
283 455         3787   my $data;
284 455         4474   while(1)
285               {
286             # changing orders of regex if/elsif statements made little difference. current order is close to optimal
287 2413 100       45948     if($self->{input} =~ /\G[ \t]*,?[ \t]*\n/cg) # cleanup leftover
288                 {
289 979         9456       $self->{linenumber}++;
290 979         9522       next;
291                 }
292 1434 100       28339     if($self->{input} =~ /\G[ \t]*}/cg)
    100          
    100          
    100          
293                 {
294 337 50 33     67032       if(!($self->{depth}--) && $self->{input} =~ /\S/)
295                   {
296 0         0         croak "Data Error: extra (mismatched) '}' found on line $self->{linenumber} in $self->{filename}!\n";
297                   }
298 337         6193       return $data
299                 }
300                 elsif($self->{input} =~ /\G[ \t]*{/cg)
301                 {
302 140         1161       $self->{depth}++;
303 140         1710       push(@$data, $self->_parse())
304                 }
305                 elsif($self->{input} =~ /\G[ \t]*([\w-]+)(\s*)/cg)
306                 {
307 947         24291       my ($id, $lines) = ($1, $2);
308             # we're prepared for NCBI to make the format even worse:
309             # note: to count line numbers right for text files on different OS, I'm sacrificing much speed (maybe I shouldn't worry so much)
310 947   66     16878       $self->{linenumber} += $lines =~ s/\n//g || $lines =~ s/\r//g; # count by *NIX/Win or Mac
311 947         12178       my $tmp;
312             # we put \s* in lookahead for linenumber counting purpose (which slows things down)
313 947 100 66     23377       if(($self->{input} =~ /\G"((?:[^"]|"")*)"(?=\s*[,}])/cg && ++$tmp) ||
    100 100        
    50          
314                      $self->{input} =~ /\G([\w-]+)(?=\s*[,}])/cg)
315                   {
316 633         6959         my $value = $1;
317 633 100       8957         if($tmp) # slight speed optimization, not really necessary since regex is fast enough
318                     {
319 202         3878           $value =~ s/""/"/g;
320 202   66     4235           $self->{linenumber} += $value =~ s/\n//g || $value =~ s/\r//g; # count by *NIX/Win or Mac
321 202         2818           $value =~ s/[\r\n]+//g; # in case it's Win format
322                     }
323 633 100       16590         if(ref($data->{$id})) { push(@{$data->{$id}}, $value) } # hash value is not a terminal (or have multiple values), create array to avoid multiple same-keyed hash overwrite each other
  5 100       42  
  5         52  
324 1         14         elsif($data->{$id}) { $data->{$id} = [$data->{$id}, $value] } # hash value has a second terminal value now!
325 627         8470         else { $data->{$id} = $value } # the first terminal value
326                   }
327                   elsif($self->{input} =~ /\G{/cg)
328                   {
329 197         1914         $self->{depth}++;
330 197         1624         push(@{$data->{$id}}, $self->_parse());
  197         5875  
331                   }
332 0         0       elsif($self->{input} =~ /\G(?=[,}])/cg) { push(@$data, $id) }
333                   else # must be "id value value" format
334                   {
335 117         1116         $self->{depth}++;
336 117         947         push(@{$data->{$id}}, $self->_parse(1))
  117         3348  
337                   }
338 947 100       13308       if($flag)
339                   {
340 117 50 33     2188         if(!($self->{depth}--) && $self->{input} =~ /\S/)
341                     {
342 0         0           croak "Data Error: extra (mismatched) '}' found on line $self->{linenumber} in $self->{filename}!\n";
343                     }
344 117         4168         return $data;
345                   }
346                 }
347                 elsif($self->{input} =~ /\G[ \t]*"((?:[^"]|"")*)"(?=\s*[,}])/cg)
348                 {
349 9         88       my $value = $1;
350 9         79       $value =~ s/""/"/g;
351 9   33     112       $self->{linenumber} += $value =~ s/\n//g || $value =~ s/\r//g; # count by *NIX/Win or Mac
352 9         76       $value =~ s/[\r\n]+//g; # in case it's Win format
353 9         87       push(@$data, $value)