File Coverage

blib/lib/Bio/ASN1/Sequence.pm
Criterion Covered Total %
statement 113 129 87.6
branch 55 78 70.5
condition 28 54 51.9
subroutine 11 12 91.7
pod 8 8 100.0
total 215 281 76.5


line stmt bran cond sub pod time code
1             =head1 NAME
2            
3             Bio::ASN1::Sequence - Regular expression-based Perl Parser for ASN.1-formatted NCBI Sequences.
4            
5             =head1 SYNOPSIS
6            
7             use Bio::ASN1::Sequence;
8            
9             my $parser = Bio::ASN1::Sequence->new('file' => "downloaded.asn1");
10             while(my $result = $parser->next_seq)
11             {
12             # extract data from $result, or Dumpvalue->new->dumpValue($result);
13             }
14            
15             # a new way to get the $result data hash for a particular sequence id:
16             use Bio::ASN1::Sequence::Indexer;
17             my $inx = Bio::ASN1::Sequence::Indexer->new(-filename => 'seq.idx');
18             my $seq = $inx->fetch_hash('AF093062');
19            
20             # for creation of .idx index files please refer to
21             # Bio::ASN1::Sequence::Indexer perldoc
22            
23             =head1 PREREQUISITE
24            
25             None.
26            
27             =head1 INSTALLATION
28            
29             Bio::ASN1::Sequence is part of the Bio::ASN1::EntrezGene package.
30             Bio::ASN1::EntrezGene package can be installed & tested as follows:
31            
32             perl Makefile.PL
33             make
34             make test
35             make install
36            
37             =head1 DESCRIPTION
38            
39             Bio::ASN1::Sequence is a regular expression-based Perl Parser for ASN.1-formatted
40             NCBI sequences. It parses an ASN.1-formatted sequence record and returns a data
41             structure that contains all data items from the sequence record.
42            
43             The parser will report error & line number if input data does not conform to the
44             NCBI Sequence annotation file format.
45            
46             The sequence parser is basically a modified version of the high-performance
47             Bio::ASN1::EntrezGene parser. However, I created a standalone module for sequence
48             since it is more efficient to keep Sequence-specific code out of EntrezGene.pm.
49            
50             In fact it is possible to provide reading of all NCBI's ASN.1-formatted
51             files through simple variations of the Entrez Gene parser (I need more
52             investigation to be sure, but at least the sequence parser works well).
53            
54             Since demand for parsing NCBI ASN.1-formatted sequences is much lower than EntrezGene,
55             this module is more like a beta version that works on the examples I checked, but
56             I did not check all available records or data definitions. The error-reporting
57             function of this module has to be useful sometimes. :)
58            
59             =head1 SEE ALSO
60            
61             The parse_sequence_example.pl script included in this package (please
62             see the Bio-ASN1-EntrezGene-x.xx/examples directory) shows the usage.
63            
64             Please check out perldoc for Bio::ASN1::EntrezGene for more info.
65            
66             =head1 AUTHOR
67            
68             Dr. Mingyi Liu <mingyi.liu@gpc-biotech.com>
69            
70             =head1 COPYRIGHT
71            
72             The Bio::ASN1::EntrezGene module and its related modules and scripts
73             are copyright (c) 2005 Mingyi Liu, GPC Biotech AG and Altana Research
74             Institute. All rights reserved. I created these modules when working
75             on a collaboration project between these two companies. Therefore a
76             special thanks for the two companies to allow the release of the code
77             into public domain.
78            
79             You may use and distribute them under the terms of the Perl itself or
80             GPL (L<http://www.gnu.org/copyleft/gpl.html>).
81            
82             =head1 CITATION
83            
84             Liu, M and Grigoriev, A (2005) "Fast Parsers for Entrez Gene"
85             Bioinformatics. In press
86            
87             =head1 OPERATION SYSTEMS SUPPORTED
88            
89             Any OS that Perl runs on.
90            
91             =head1 METHODS
92            
93             =cut
94              
95             package Bio::ASN1::Sequence;
96              
97 2     2   28 use strict;
  2         18  
  2         65  
98 2     2   45 use Carp qw(carp croak);
  2         19  
  2         37  
99 2     2   34 use vars qw ($VERSION);
  2         18  
  2         59  
100              
101             $VERSION = '1.09';
102              
103             =head2 new
104            
105             Parameters: maxerrstr => 20 (optional) - maximum number of characters after
106             offending element, used by error reporting, default is 20
107             file or -file => $filename (optional) - name of the file to be
108             parsed. call next_seq to parse!
109             fh or -fh => $filehandle (optional) - handle of the file to be
110             parsed.
111             Example: my $parser = Bio::ASN1::Sequence->new();
112             Function: Instantiate a parser object
113             Returns: Object reference
114             Notes: Setting file or fh will reset line numbers etc. that are used
115             for error reporting purposes, and seeking on file handle would
116             mess up linenumbers!
117            
118             =cut
119              
120             sub new
121             {
122 1     1 1 10   my $class = shift;
123 1 50       13   $class = ref($class) if(ref($class));
124 1         14   my $self = { maxerrstr => 20, @_ };
125 1         11   bless $self, $class;
126 1 100       11   map { $self->input_file($self->{$_}) if($self->{$_}) } qw(file -file);
  2         29  
127 1 100       11   map { $self->fh($self->{$_}) if($self->{$_}) } qw(fh -fh);
  2         34  
128 1         13   return $self;
129             }
130              
131             =head2 maxerrstr
132            
133             Parameters: $maxerrstr (optional) - maximum number of characters after
134             offending element, used by error reporting, default is 20
135             Example: $parser->maxerrstr(20);
136             Function: get/set maxerrstr.
137             Returns: maxerrstr.
138             Notes:
139            
140             =cut
141              
142             sub maxerrstr
143             {
144 0     0 1 0   my ($self, $value) = @_;
145 0 0       0   $self->{maxerrstr} = $value if $value > 0;
146 0         0   return $self->{maxerrstr};
147             }
148              
149              
150             =head2 parse
151            
152             Parameters: $string that contains Sequence record,
153             $trimopt (optional) that specifies how the data structure
154             returned should be trimmed. 2 is recommended and
155             default
156             $noreset (optional) that species that line number should not
157             be reset
158             DEPRECATED as external function!!! Do not call this function
159             directly! Call next_seq() instead
160             Example: my $value = $parser->parse($text); # DEPRECATED as
161             # external function!!! Do not call this function
162             # directly! Call next_seq() instead
163             Function: Takes in a string representing Sequence record, parses
164             the record and returns a data structure.
165             Returns: A data structure containing all data items from the sequence
166             record.
167             Notes: DEPRECATED as external function!!! Do not call this function
168             directly! Call next_seq() instead
169             $string should not contain 'Seq-entry ::= set' at beginning!
170            
171             =cut
172              
173             sub parse
174             {
175 1     1 1 98   my ($self, $input, $compact, $noreset) = @_;
176 1 50       12   $input || croak "must have input!\n";
177 1         90   $self->{input} = $input;
178 1 50       13   $self->{filename} = "input" unless $self->{filename};
179 1 50 33     16   $self->{linenumber} = 1 unless $self->{linenumber} && $noreset;
180 1         10   $self->{depth} = 0;
181 1         9   my $result;
182               eval
183 1         10   {
184 1         13     $result = $self->_parse(); # no need to reset $self->{depth} or linenumber
185               };
186 1 50       11   if($@)
187               {
188 0 0       0     if($@ !~ /^Data Error:/)
189                 {
190 0         0       croak "non-conforming data broke parser on line $self->{linenumber} in $self->{filename}\n".
191                         "possible cause includes randomly inserted brackets in input file before line $self->{linenumber}\n".
192                         "first $self->{maxerrstr} (or till end of input) characters including the non-conforming data:\n" .
193                         substr($self->{input}, pos($self->{input}), $self->{maxerrstr}) . "\nRaw error mesg: $@\n";
194                 }
195 0         0     else { die $@ }
196               }
197 1         12   trimdata($result, $compact);
198 1         20   return $result;
199             }
200              
201             =head2 input_file
202            
203             Parameters: $filename for file that contains Sequence record(s)
204             Example: $parser->input_file($filename);
205             Function: Takes in name of a file containing Sequence records.
206             opens the file and stores file handle
207             Returns: none.
208             Notes: Attemps to open file larger than 2 GB even on Perl that
209             does not support 2 GB file (accomplished by calling
210             "cat" and piping output. On OS that does not have "cat"
211             error message will be displayed)
212            
213             =cut
214              
215             sub input_file
216             {
217 1     1 1 12   my ($self, $filename) = @_;
218             # in case user's Perl system can't handle large file. Assuming Unix, otherwise raise error
219 1         11   local *IN; # older styled code to enable module to work with perl 5.005_03
220 1 50 0     74   open(*IN, $filename) ||
      33        
221               ($! =~ /too large/i && open(*IN, "cat $filename |")) ||
222                 croak "can't open $filename! -- $!\n";
223 1         13   $self->{fh} = *IN;
224 1         11   $self->{filename} = $filename;
225 1         16   $self->{linenumber} = 0; # reset line number
226             }
227              
228             =head2 next_seq
229            
230             Parameters: $trimopt (optional) that specifies how the data structure
231             returned should be trimmed. option 2 is recommended and
232             default
233             Example: my $value = $parser->next_seq();
234             Function: Use the file handle generated by input_file, parses the next
235             the record and returns a data structure.
236             Returns: A data structure containing all data items from the sequence
237             record.
238             Notes: Must pass in a filename through new() or input_file() first!
239             For details on how to use the $trimopt data trimming option
240             please see comment for the trimdata method. An option
241             of 2 is recommended and default
242             The acceptable values for $trimopt include:
243             1 - trim as much as possibile
244             2 (or 0, undef) - trim to an easy-to-use structure
245             3 - no trimming (in version 1.06, prior to version
246             1.06, 0 or undef means no trimming)
247            
248             =cut
249              
250             sub next_seq
251             {
252 1     1 1 14   my ($self, $compact) = @_;
253 1 50       26   $self->{fh} || croak "you must pass in a file name or handle through new() or input_file() first before calling next_seq!\n";
254 1         17   local $/ = "Seq-entry ::= set {"; # set record separator
255 1         163   while($_ = readline($self->{fh}))
256               {
257 2         21     chomp;
258 2 100       141     next unless /\S/;
259 1 50       34     my $tmp = (/^\s*Seq-entry ::= set ({.*)/si)? $1 : "{" . $_; # get rid of the 'Seq-entry ::= set ' at the beginning of Sequence record
260 1         13     return $self->parse($tmp, $compact, 1); # 1 species no resetting line number
261               }
262             }
263              
264             # NCBI's Apr 05, 2005 format change forced much usage of lookahead, which would for
265             # sure slows parser down. But can't code efficiently without it.
266             sub _parse
267             {
268 147     147   1389   my ($self, $flag) = @_;
269 147         1293   my $data;
270 147         1157   while(1)
271               {
272             # changing orders of regex if/elsif statements made little difference. current order is close to optimal
273 622 100       8508     if($self->{input} =~ /\G[ \t]*,?[ \t]*\n/cg) # cleanup leftover
274                 {
275 238         2393       $self->{linenumber}++;
276 238         2062       next;
277                 }
278 384 100       13666     if($self->{input} =~ /\G[ \t]*}/cg)
    100          
    100          
    100          
279                 {
280 104 50 33     1070       if(!($self->{depth}--) && $self->{input} =~ /\S/)
281                   {
282 0         0         croak "Data Error: extra (mismatched) '}' found on line $self->{linenumber} in $self->{filename}!\n";
283                   }
284 104         1131       return $data
285                 }
286                 elsif($self->{input} =~ /\G[ \t]*{/cg)
287                 {
288 20         167       $self->{depth}++;
289 20         219       push(@$data, $self->_parse())
290                 }
291                 elsif($self->{input} =~ /\G[ \t]*([\w-]+)(\s*)/cg)
292                 {
293 258         2803       my ($id, $lines) = ($1, $2);
294             # we're prepared for NCBI to make the format even worse:
295             # note: to count line numbers right for text files on different OS, I'm sacrificing much speed (maybe I shouldn't worry so much)
296 258   66     3771       $self->{linenumber} += $lines =~ s/\n//g || $lines =~ s/\r//g; # count by *NIX/Win or Mac
297 258         2147       my ($tmp, $tmp1);
298             # we put \s* in lookahead for linenumber counting purpose (which slows things down)
299 258 100 66     7706       if(($self->{input} =~ /\G"((?:[^"]|"")*)"(?=\s*[,}])/cg && ++$tmp) ||
    100 66        
    50 100        
      100        
300                      ($self->{input} =~ /\G'([^']+)'\s*H/icg && ++$tmp1) || # this is the only difference b/w sequence and entrez gene formats so far
301                      $self->{input} =~ /\G([\w-]+)(?=\s*[,}])/cg)
302                   {
303 132         1393         my $value = $1;
304 132 100       1249         if($tmp) # slight speed optimization, not really necessary since regex is fast enough
    100          
305                     {
306 66         693           $value =~ s/""/"/g;
307 66   66     861           $self->{linenumber} += $value =~ s/\n//g || $value =~ s/\r//g; # count by *NIX/Win or Mac
308 66         591           $value =~ s/[\r\n]+//g; # in case it's Win format
309                     }
310                     elsif($tmp1) # slight speed optimization, not really necessary since regex is fast enough
311                     {
312 1         93           $value =~ tr/fF8421/NNTGCA/; # good for NCBI4na. But if NCBI8na was used, then more needs to be transliterated
313 1   33     33           $self->{linenumber} += $value =~ s/\n//g || $value =~ s/\r//g; # count by *NIX/Win or Mac
314 1         25           $value =~ s/[\r\n0]+//g; # in case it's Win format (get rid of '0' at end of seq too)
315                     }
316 132 50       2483         if(ref($data->{$id})) { push(@{$data->{$id}}, $value) } # hash value is not a terminal (or have multiple values), create array to avoid multiple same-keyed hash overwrite each other
  0 50       0  
  0         0  
317 0         0         elsif($data->{$id}) { $data->{$id} = [$data->{$id}, $value] } # hash value has a second terminal value now!
318 132         1437         else { $data->{$id} = $value } # the first terminal value
319                   }
320                   elsif($self->{input} =~ /\G{/cg)
321                   {
322 84         910         $self->{depth}++;
323 84         1015         push(@{$data->{$id}}, $self->_parse());
  84         1339  
324                   }
325 0         0       elsif($self->{input} =~ /\G(?=[,}])/cg) { push(@$data, $id) }
326                   else # must be "id value value" format
327                   {
328 42         380         $self->{depth}++;
329 42         335         push(@{$data->{$id}}, $self->_parse(1))
  42         610  
330                   }
331 258 100       2572       if($flag)
332                   {
333 42 50 33     2946         if(!($self->{depth}--) && $self->{input} =~ /\S/)
334                     {
335 0         0           croak "Data Error: extra (mismatched) '}' found on line $self->{linenumber} in $self->{filename}!\n";
336                     }
337 42         476         return $data;
338                   }
339                 }
340                 elsif($self->{input} =~ /\G[ \t]*"((?:[^"]|"")*)"(?=\s*[,}])/cg)
341                 {
342 1         13       my $value = $1;
343 1         10       $value =~ s/""/"/g;
344 1   33     17       $self->{linenumber} += $value =~ s/\n//g || $value =~ s/\r//g; # count by *NIX/Win or Mac
345 1         11       $value =~ s/[\r\n]+//g; # in case it's Win format
346 1         13       push