Class Bio::FlatFile::AutoDetect
In: lib/bio/io/flatfile/autodetection.rb
Parent: Object

AutoDetect automatically determines the database class of the given data.

Methods

Included Modules

TSort

Classes and Modules

Class Bio::FlatFile::AutoDetect::RuleDebug
Class Bio::FlatFile::AutoDetect::RuleProc
Class Bio::FlatFile::AutoDetect::RuleRegexp
Class Bio::FlatFile::AutoDetect::RuleRegexp2
Class Bio::FlatFile::AutoDetect::RuleSpecial
Class Bio::FlatFile::AutoDetect::RuleTemplate
Class Bio::FlatFile::AutoDetect::RulesArray

Constants

TopRule = RuleSpecial.new('top')   Special element that is always top priority.
BottomRule = RuleSpecial.new('bottom')   Special element that is always bottom priority.

Public Class methods

Makes a new autodetect object and adds each of the given elements to it.

[Source]

     # File lib/bio/io/flatfile/autodetection.rb, line 361
# Builds a detector pre-loaded with the given rule elements.
#
# Every argument is registered through +add+, in the order given.
# Returns the newly created detector.
def self.[](*arg)
  arg.each_with_object(new) { |rule, detector| detector.add(rule) }
end

Returns the default autodetect object, creating it with make_default on first use.

[Source]

     # File lib/bio/io/flatfile/autodetection.rb, line 348
# Returns the shared default detector.
#
# The detector is built with +make_default+ the first time it is requested
# (or again whenever the stored value is nil/false) and then reused.
def self.default
  @default ||= make_default
end

sets the default autodetect object.

[Source]

     # File lib/bio/io/flatfile/autodetection.rb, line 356
356:       def self.default=(ad)
             # Stores +ad+ as the class-level default detector. No validation is
             # performed; assigning nil or false makes the default reader build
             # a fresh detector on its next call.
357:         @default = ad
358:       end

Builds a new autodetect object populated with the default set of detection rules.

[Source]

     # File lib/bio/io/flatfile/autodetection.rb, line 368
368:       def self.make_default
             # Builds and returns a new detector holding the built-in rules.
             #
             # Each rule is assigned to a local variable while being passed to
             # self[...], so that the priority section further down can refer
             # to individual rules. Not every local is used afterwards.
369:         a = self[
370:           genbank  = RuleRegexp[ 'Bio::GenBank',
371:             /^LOCUS       .+ bp .*[a-z]*[DR]?NA/ ],
372:           genpept  = RuleRegexp[ 'Bio::GenPept',
373:             /^LOCUS       .+ aa .+/ ],
374:           medline  = RuleRegexp[ 'Bio::MEDLINE',
375:             /^PMID\- [0-9]+$/ ],
376:           embl     = RuleRegexp[ 'Bio::EMBL',
377:             /^ID   .+\; .*(DNA|RNA|XXX)\;/ ],
378:           sptr     = RuleRegexp2[ 'Bio::SPTR',
379:             /^ID   .+\; *PRT\;/,
380:             /^ID   [-A-Za-z0-9_\.]+ .+\; *[0-9]+ *AA\./ ],
381:           prosite  = RuleRegexp[ 'Bio::PROSITE',
382:             /^ID   [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/ ],
383:           transfac = RuleRegexp[ 'Bio::TRANSFAC',
384:             /^AC  [-A-Za-z0-9_\.]+$/ ],
385: 
               # The block returns a class when the subtype is recognised, false
               # when the "H " header matched but neither subtype marker was
               # found, and nil when the text has no "H " header line at all.
386:           aaindex  = RuleProc.new('Bio::AAindex1', 'Bio::AAindex2') do |text|
387:             if /^H [-A-Z0-9_\.]+$/ =~ text then
388:               if text =~ /^M [rc]/ then
389:                 Bio::AAindex2
390:               elsif text =~ /^I    A\/L/ then
391:                 Bio::AAindex1
392:               else
393:                 false #fail to determine
394:               end
395:             else
396:               nil
397:             end
398:           end,
399: 
400:           litdb    = RuleRegexp[ 'Bio::LITDB',
401:             /^CODE        [0-9]+$/ ],
402:           brite    = RuleRegexp[ 'Bio::KEGG::BRITE',
403:             /^Entry           [A-Z0-9]+/ ],
404:           orthology = RuleRegexp[ 'Bio::KEGG::ORTHOLOGY',
405:             /^ENTRY       .+ KO\s*/ ],
406:           drug     = RuleRegexp[ 'Bio::KEGG::DRUG',
407:             /^ENTRY       .+ Drug\s*/ ],
408:           glycan   = RuleRegexp[ 'Bio::KEGG::GLYCAN',
409:             /^ENTRY       .+ Glycan\s*/ ],
410:           enzyme   = RuleRegexp2[ 'Bio::KEGG::ENZYME',
411:             /^ENTRY       EC [0-9\.]+$/,
412:             /^ENTRY       .+ Enzyme\s*/
413:           ],
414:           compound = RuleRegexp2[ 'Bio::KEGG::COMPOUND',
415:             /^ENTRY       C[A-Za-z0-9\._]+$/,
416:             /^ENTRY       .+ Compound\s*/
417:           ],
418:           reaction = RuleRegexp2[ 'Bio::KEGG::REACTION',
419:             /^ENTRY       R[A-Za-z0-9\._]+$/,
420:             /^ENTRY       .+ Reaction\s*/
421:           ],
422:           genes    = RuleRegexp[ 'Bio::KEGG::GENES',
423:             /^ENTRY       .+ (CDS|gene|.*RNA|Contig) / ],
424:           genome   = RuleRegexp[ 'Bio::KEGG::GENOME',
425:             /^ENTRY       [a-z]+$/ ],
426: 
               # The DOCTYPE capture selects between the two MaXML classes; the
               # block returns nil when the DOCTYPE declaration is absent.
427:           fantom = RuleProc.new('Bio::FANTOM::MaXML::Cluster',
428:                                 'Bio::FANTOM::MaXML::Sequence') do |text|
429:             if /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/ =~ text
430:               case $1
431:               when 'clusters'
432:                 Bio::FANTOM::MaXML::Cluster
433:               when 'sequences'
434:                 Bio::FANTOM::MaXML::Sequence
435:               else
436:                 nil #unknown
437:               end
438:             else
439:               nil
440:             end
441:           end,
442: 
443:           pdb = RuleRegexp[ 'Bio::PDB',
444:             /^HEADER    .{40}\d\d\-[A-Z]{3}\-\d\d   [0-9A-Z]{4}/ ],
445:           het = RuleRegexp[ 'Bio::PDB::ChemicalComponent',
446:             /^RESIDUE +.+ +\d+\s*$/ ],
447: 
448:           clustal = RuleRegexp2[ 'Bio::ClustalW::Report',
449:           /^CLUSTAL .*\(.*\).*sequence +alignment/,
450:           /^CLUSTAL FORMAT for T-COFFEE/ ],
451: 
452:           gcg_msf = RuleRegexp[ 'Bio::GCG::Msf',
453:           /^!!(N|A)A_MULTIPLE_ALIGNMENT .+/ ],
454: 
455:           gcg_seq = RuleRegexp[ 'Bio::GCG::Seq',
456:           /^!!(N|A)A_SEQUENCE .+/ ],
457: 
458:           blastxml = RuleRegexp[ 'Bio::Blast::Report',
459:             /\<\!DOCTYPE BlastOutput PUBLIC / ],
460:           wublast  = RuleRegexp[ 'Bio::Blast::WU::Report',
461:             /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
462:           wutblast = RuleRegexp[ 'Bio::Blast::WU::Report_TBlast',
463:             /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
464:           blast    = RuleRegexp[ 'Bio::Blast::Default::Report',
465:             /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
466:           tblast   = RuleRegexp[ 'Bio::Blast::Default::Report_TBlast',
467:             /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
468:           rpsblast   = RuleRegexp[ 'Bio::Blast::RPSBlast::Report',
469:             /^RPS\-BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
470: 
471:           blat   = RuleRegexp[ 'Bio::Blat::Report',
472:             /^psLayout version \d+/ ],
473:           spidey = RuleRegexp[ 'Bio::Spidey::Report',
474:             /^\-\-SPIDEY version .+\-\-$/ ],
475:           hmmer  = RuleRegexp[ 'Bio::HMMER::Report',
476:             /^HMMER +\d+\./ ],
477:           sim4   = RuleRegexp[ 'Bio::Sim4::Report',
478:             /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/ ],
479: 
480:           fastq  = RuleRegexp[ 'Bio::Fastq',
481:             /^\@.+(?:\r|\r?\n)(?:[^\@\+].*(?:\r|\r?\n))+\+.*(?:\r|\r?\n).+(?:\r|\r?\n)/ ],
482: 
               # NBRF is tested first, then sequence-letter FASTA, then numeric
               # FASTA; false is returned when a ">" line exists but no pattern
               # matches, nil when there is no ">" line.
483:           fastaformat = RuleProc.new('Bio::FastaFormat',
484:                                      'Bio::NBRF',
485:                                      'Bio::FastaNumericFormat') do |text|
486:             if /^>.+$/ =~ text
487:               case text
488:               when /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/
489:                 Bio::NBRF
490:               when /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/
491:                   Bio::FastaFormat
492:               when /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/
493:                 Bio::FastaNumericFormat
494:               else
495:                 false
496:               end
497:             else
498:               nil
499:             end
500:           end
501:         ]
502: 
             # NOTE(review): is_prior_to is defined on the rule classes, outside
             # this listing; "x.is_prior_to y" presumably means x is tried
             # before y - confirm against the rule template class.
503:         # dependencies
504:         # NCBI
505:         genbank.is_prior_to genpept
506:         # EMBL/UniProt
507:         embl.is_prior_to sptr
508:         sptr.is_prior_to prosite
509:         prosite.is_prior_to transfac
510:         # KEGG
511:         #aaindex.is_prior_to litdb
512:         #litdb.is_prior_to brite
513:         brite.is_prior_to orthology
514:         orthology.is_prior_to drug
515:         drug.is_prior_to glycan
516:         glycan.is_prior_to enzyme
517:         enzyme.is_prior_to compound
518:         compound.is_prior_to reaction
519:         reaction.is_prior_to genes
520:         genes.is_prior_to genome
521:         # PDB
522:         pdb.is_prior_to het
523:         # BLAST
524:         wublast.is_prior_to wutblast
525:         wutblast.is_prior_to blast
526:         blast.is_prior_to tblast
             # NOTE(review): the calls below are made on the shared BottomRule
             # constant, so every make_default call appears to register new
             # rule objects on it - confirm how is_prior_to stores them.
527:         # Fastq
528:         BottomRule.is_prior_to(fastq)
529:         fastq.is_prior_to(fastaformat)
530:         # FastaFormat
531:         BottomRule.is_prior_to(fastaformat)
532: 
533:         # for debug
534:         #debug_first = RuleDebug.new('debug_first')
535:         #a.add(debug_first)
536:         #debug_first.is_prior_to(TopRule)
537: 
538:         ## for debug
539:         #debug_last = RuleDebug.new('debug_last')
540:         #a.add(debug_last)
541:         #BottomRule.is_prior_to(debug_last)
542:         #fastaformat.is_prior_to(debug_last)
543: 
             # Clear the cached ordering now that priorities have been declared.
544:         a.rehash
545:         return a
546:       end

Creates a new Autodetect object

[Source]

     # File lib/bio/io/flatfile/autodetection.rb, line 226
# Creates an empty detector that contains only the two special
# priority markers, TopRule and BottomRule.
def initialize
  @rules = {}      # registered autodetection rules
  @elements = nil  # cached element list; nil until first computed
  [TopRule, BottomRule].each { |marker| add(marker) }
end

Public Instance methods

Adds a new element. Returns elem.

[Source]

     # File lib/bio/io/flatfile/autodetection.rb, line 237
# Registers a rule element under its name and returns the element.
#
# elem:: any object responding to +name+; the name is the registry key.
#
# Raises RuntimeError when an element with the same name is already
# registered. The message now includes the conflicting name so the
# offending rule can be identified. Nothing is modified in that case.
def add(elem)
  name = elem.name
  raise "element name conflicts: #{name.inspect}" if @rules[name]
  # The cached ordering is stale as soon as the rule set changes.
  @elements = nil
  @rules[name] = elem
  elem
end

Autodetect from the text. Returns a database class if succeeded. Returns nil if failed.

[Source]

     # File lib/bio/io/flatfile/autodetection.rb, line 305
# Asks each element, in the order given by +elements+, to guess the
# database class for +text+ and stops at the first truthy answer.
#
# text:: the text to examine
# meta:: extra information passed unchanged to every rule's +guess+
#
# Returns the first truthy guess. When no rule succeeds, the value of the
# last guess (nil or false) is returned; nil when there are no elements.
def autodetect(text, meta = {})
  outcome = nil
  elements.find { |rule| outcome = rule.guess(text, meta) }
  outcome
end

Autodetects from the FlatFile object, examining up to the given number of lines (31 by default). Returns a database class on success. Returns nil on failure.

[Source]

     # File lib/bio/io/flatfile/autodetection.rb, line 318
# Autodetects the database class from a FlatFile-like object.
#
# ff::    object whose @stream instance variable holds the input stream;
#         the stream must respond to +prefetch_buffer+ and +prefetch_gets+
# lines:: maximum number of lines to prefetch while guessing (default 31)
#
# Returns the detected class, or nil when nothing matched within +lines+
# lines or the stream ran out of data.
def autodetect_flatfile(ff, lines = 31)
  meta = {}
  stream = ff.instance_variable_get(:@stream)
  # Ask whether the stream exposes a path instead of rescuing NameError,
  # so that a genuine NameError raised inside #path is no longer hidden.
  path = stream.path if stream.respond_to?(:path)
  if path
    meta[:path] = path
    # Call autodetect once with meta, before any read action.
    found = autodetect(stream.prefetch_buffer, meta)
    return found if found
  end
  # Read the stream line by line; the whole prefetched buffer is
  # re-examined after every non-blank line.
  1.upto(lines) do
    line = stream.prefetch_gets
    break unless line
    next if line.strip.empty?
    found = autodetect(stream.prefetch_buffer, meta)
    return found if found
  end
  nil
end

Iterates over each element.

[Source]

     # File lib/bio/io/flatfile/autodetection.rb, line 298
298:       def each_rule(&x) #:yields: elem
             # Forwards the given block to +elements+, so rules are yielded in
             # the order that method returns them. The return value is whatever
             # elements.each returns.
299:         elements.each(&x)
300:       end

Returns current elements as an array whose order fulfills all elements’ priorities.

[Source]

     # File lib/bio/io/flatfile/autodetection.rb, line 275
# Returns the registered elements as an array, computed as the reverse of
# the +tsort+ result.
#
# The array is computed once and cached in @elements until the cache is
# reset to nil.
def elements
  @elements ||= tsort.reverse
end

visualizes the object (mainly for debug)

[Source]

     # File lib/bio/io/flatfile/autodetection.rb, line 291
# Returns a debugging string of the form "<ClassName "name1" "name2" ...>",
# listing the inspected name of every element in +elements+ order.
def inspect
  names = elements.map { |e| e.name.inspect }
  "<#{self.class} #{names.join(' ')}>"
end

rebuilds the object and clears internal cache.

[Source]

     # File lib/bio/io/flatfile/autodetection.rb, line 285
285:       def rehash
             # Re-indexes the rule hash and drops the cached element ordering.
             # Returns nil (the value of the final assignment).
286:         @rules.rehash
             # A nil cache makes +elements+ recompute the ordering on next use.
287:         @elements = nil
288:       end

(required by TSort.) For a given element, yields each child (= lower priority elements) of the element.

[Source]

     # File lib/bio/io/flatfile/autodetection.rb, line 253
253:       def tsort_each_child(elem)
             # Yields the nodes that TSort should treat as children of +elem+.
             # Three cases, depending on which node is asked about:
             # * TopRule: every registered rule except TopRule itself and rules
             #   whose lower_priority_elements contain TopRule.
             # * BottomRule: every registered rule whose
             #   higher_priority_elements contain BottomRule.
             # * any other rule: its own lower_priority_elements except
             #   BottomRule, followed by BottomRule itself unless the rule's
             #   higher_priority_elements contain BottomRule.
254:         if elem == TopRule then
255:           @rules.each_value do |e|
                 # "unless" covers the whole "or" expression: both conditions
                 # suppress the yield.
256:             yield e unless e == TopRule or 
257:               e.lower_priority_elements.index(TopRule)
258:           end
259:         elsif elem == BottomRule then
260:           @rules.each_value do |e|
261:             yield e if e.higher_priority_elements.index(BottomRule)
262:           end
263:         else
264:           elem.lower_priority_elements.each do |e|
265:             yield e if e != BottomRule
266:           end
267:           unless elem.higher_priority_elements.index(BottomRule)
268:             yield BottomRule
269:           end
270:         end
271:       end

(required by TSort.) For all elements, yields each element.

[Source]

     # File lib/bio/io/flatfile/autodetection.rb, line 246
246:       def tsort_each_node(&x)
             # Passes the block to every registered rule, i.e. each value of the
             # @rules hash, in the hash's iteration order.
247:         @rules.each_value(&x)
248:       end

[Validate]