Class | Bio::FlatFile::AutoDetect |
In: |
lib/bio/io/flatfile/autodetection.rb
|
Parent: | Object |
AutoDetect automatically determines the database class of the given data.
TopRule | = | RuleSpecial.new('top') | Special element that is always top priority. | |
BottomRule | = | RuleSpecial.new('bottom') | Special element that is always bottom priority. |
Makes a new autodetect object and adds each of the given elements to it.
# File lib/bio/io/flatfile/autodetection.rb, line 361
#
# Creates a new autodetect object and adds every given element to it,
# in argument order.
#
# Arguments:
# * _arg_: elements handed one by one to +add+
# Returns:: the newly created autodetect object
def self.[](*arg)
  detector = self.new
  arg.each do |element|
    detector.add(element)
  end
  detector
end
returns the default autodetect object
# File lib/bio/io/flatfile/autodetection.rb, line 348
#
# Returns the default autodetect object.
#
# The object is built with +make_default+ the first time it is needed
# (that is, while <tt>@default</tt> is still nil or false) and is reused
# afterwards.
def self.default
  @default ||= self.make_default
end
sets the default autodetect object.
# File lib/bio/io/flatfile/autodetection.rb, line 356
#
# Replaces the default autodetect object.
#
# Arguments:
# * _ad_: object stored in <tt>@default</tt>; it is stored as given,
#   without any check or copy
def self.default=(ad)
  @default = ad
end
Builds the autodetect object that is used as the default when no default has been set.
# File lib/bio/io/flatfile/autodetection.rb, line 368
#
# Builds the autodetect object used as the default.
#
# The method has three steps:
# 1. create the rule elements and register them, in the order written,
#    through <tt>self[...]</tt>;
# 2. declare relative priorities between some of the rules with
#    +is_prior_to+;
# 3. call +rehash+ on the result and return it.
#
# Every rule is also assigned to a local variable inside the argument
# list so that step 2 can refer to it by name. Several of these locals
# (for example +medline+ and +rpsblast+) are not referenced again.
#
# NOTE(review): this listing looks like it has had runs of whitespace
# collapsed to single spaces, which would also affect the literal spaces
# inside the regexps below -- confirm against the real source file before
# relying on the exact patterns.
#
# Returns:: the populated autodetect object
def self.make_default
  a = self[
    genbank = RuleRegexp[ 'Bio::GenBank',
      /^LOCUS .+ bp .*[a-z]*[DR]?NA/ ],
    genpept = RuleRegexp[ 'Bio::GenPept',
      /^LOCUS .+ aa .+/ ],
    medline = RuleRegexp[ 'Bio::MEDLINE',
      /^PMID\- [0-9]+$/ ],
    embl = RuleRegexp[ 'Bio::EMBL',
      /^ID .+\; .*(DNA|RNA|XXX)\;/ ],
    # RuleRegexp2 is given two patterns for one class name.
    # presumably either pattern is enough to match -- verify against RuleRegexp2
    sptr = RuleRegexp2[ 'Bio::SPTR',
      /^ID .+\; *PRT\;/,
      /^ID [-A-Za-z0-9_\.]+ .+\; *[0-9]+ *AA\./ ],
    prosite = RuleRegexp[ 'Bio::PROSITE',
      /^ID [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/ ],
    transfac = RuleRegexp[ 'Bio::TRANSFAC',
      /^AC [-A-Za-z0-9_\.]+$/ ],

    # The block yields a class, false, or nil:
    # nil when the "H" line is absent, false when it is present but
    # neither of the two follow-up patterns matches.
    aaindex = RuleProc.new('Bio::AAindex1', 'Bio::AAindex2') do |text|
      if /^H [-A-Z0-9_\.]+$/ =~ text then
        if text =~ /^M [rc]/ then
          Bio::AAindex2
        elsif text =~ /^I A\/L/ then
          Bio::AAindex1
        else
          false #fail to determine
        end
      else
        nil
      end
    end,

    litdb = RuleRegexp[ 'Bio::LITDB',
      /^CODE [0-9]+$/ ],
    brite = RuleRegexp[ 'Bio::KEGG::BRITE',
      /^Entry [A-Z0-9]+/ ],
    orthology = RuleRegexp[ 'Bio::KEGG::ORTHOLOGY',
      /^ENTRY .+ KO\s*/ ],
    drug = RuleRegexp[ 'Bio::KEGG::DRUG',
      /^ENTRY .+ Drug\s*/ ],
    glycan = RuleRegexp[ 'Bio::KEGG::GLYCAN',
      /^ENTRY .+ Glycan\s*/ ],
    enzyme = RuleRegexp2[ 'Bio::KEGG::ENZYME',
      /^ENTRY EC [0-9\.]+$/,
      /^ENTRY .+ Enzyme\s*/
    ],
    compound = RuleRegexp2[ 'Bio::KEGG::COMPOUND',
      /^ENTRY C[A-Za-z0-9\._]+$/,
      /^ENTRY .+ Compound\s*/
    ],
    reaction = RuleRegexp2[ 'Bio::KEGG::REACTION',
      /^ENTRY R[A-Za-z0-9\._]+$/,
      /^ENTRY .+ Reaction\s*/
    ],
    genes = RuleRegexp[ 'Bio::KEGG::GENES',
      /^ENTRY .+ (CDS|gene|.*RNA|Contig) / ],
    genome = RuleRegexp[ 'Bio::KEGG::GENOME',
      /^ENTRY [a-z]+$/ ],

    # The DOCTYPE capture ($1) selects between the two MaXML classes;
    # the block yields nil when the DOCTYPE is absent.
    fantom = RuleProc.new('Bio::FANTOM::MaXML::Cluster',
                          'Bio::FANTOM::MaXML::Sequence') do |text|
      if /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/ =~ text
        case $1
        when 'clusters'
          Bio::FANTOM::MaXML::Cluster
        when 'sequences'
          Bio::FANTOM::MaXML::Sequence
        else
          nil #unknown
        end
      else
        nil
      end
    end,

    pdb = RuleRegexp[ 'Bio::PDB',
      /^HEADER .{40}\d\d\-[A-Z]{3}\-\d\d [0-9A-Z]{4}/ ],
    het = RuleRegexp[ 'Bio::PDB::ChemicalComponent',
      /^RESIDUE +.+ +\d+\s*$/ ],

    clustal = RuleRegexp2[ 'Bio::ClustalW::Report',
      /^CLUSTAL .*\(.*\).*sequence +alignment/,
      /^CLUSTAL FORMAT for T-COFFEE/ ],

    gcg_msf = RuleRegexp[ 'Bio::GCG::Msf',
      /^!!(N|A)A_MULTIPLE_ALIGNMENT .+/ ],

    gcg_seq = RuleRegexp[ 'Bio::GCG::Seq',
      /^!!(N|A)A_SEQUENCE .+/ ],

    blastxml = RuleRegexp[ 'Bio::Blast::Report',
      /\<\!DOCTYPE BlastOutput PUBLIC / ],
    wublast = RuleRegexp[ 'Bio::Blast::WU::Report',
      /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
    wutblast = RuleRegexp[ 'Bio::Blast::WU::Report_TBlast',
      /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
    blast = RuleRegexp[ 'Bio::Blast::Default::Report',
      /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
    tblast = RuleRegexp[ 'Bio::Blast::Default::Report_TBlast',
      /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
    rpsblast = RuleRegexp[ 'Bio::Blast::RPSBlast::Report',
      /^RPS\-BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],

    blat = RuleRegexp[ 'Bio::Blat::Report',
      /^psLayout version \d+/ ],
    spidey = RuleRegexp[ 'Bio::Spidey::Report',
      /^\-\-SPIDEY version .+\-\-$/ ],
    hmmer = RuleRegexp[ 'Bio::HMMER::Report',
      /^HMMER +\d+\./ ],
    sim4 = RuleRegexp[ 'Bio::Sim4::Report',
      /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/ ],

    fastq = RuleRegexp[ 'Bio::Fastq',
      /^\@.+(?:\r|\r?\n)(?:[^\@\+].*(?:\r|\r?\n))+\+.*(?:\r|\r?\n).+(?:\r|\r?\n)/ ],

    # Only text containing a ">" header line is considered. The block
    # yields false when such a line exists but none of the three
    # sub-formats matches, and nil when there is no ">" line at all.
    fastaformat = RuleProc.new('Bio::FastaFormat',
                               'Bio::NBRF',
                               'Bio::FastaNumericFormat') do |text|
      if /^>.+$/ =~ text
        case text
        when /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/
          Bio::NBRF
        when /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/
          Bio::FastaFormat
        when /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/
          Bio::FastaNumericFormat
        else
          false
        end
      else
        nil
      end
    end
  ]

  # dependencies
  # presumably "x.is_prior_to y" makes x take priority over y during
  # detection -- verify against the rule classes' is_prior_to
  # NCBI
  genbank.is_prior_to genpept
  # EMBL/UniProt
  embl.is_prior_to sptr
  sptr.is_prior_to prosite
  prosite.is_prior_to transfac
  # KEGG
  #aaindex.is_prior_to litdb
  #litdb.is_prior_to brite
  brite.is_prior_to orthology
  orthology.is_prior_to drug
  drug.is_prior_to glycan
  glycan.is_prior_to enzyme
  enzyme.is_prior_to compound
  compound.is_prior_to reaction
  reaction.is_prior_to genes
  genes.is_prior_to genome
  # PDB
  pdb.is_prior_to het
  # BLAST
  wublast.is_prior_to wutblast
  wutblast.is_prior_to blast
  blast.is_prior_to tblast
  # Fastq
  # fastq and fastaformat are the only rules declared relative to
  # BottomRule here.
  BottomRule.is_prior_to(fastq)
  fastq.is_prior_to(fastaformat)
  # FastaFormat
  BottomRule.is_prior_to(fastaformat)

  # for debug
  #debug_first = RuleDebug.new('debug_first')
  #a.add(debug_first)
  #debug_first.is_prior_to(TopRule)

  ## for debug
  #debug_last = RuleDebug.new('debug_last')
  #a.add(debug_last)
  #BottomRule.is_prior_to(debug_last)
  #fastaformat.is_prior_to(debug_last)

  # rehash is called after the priorities are declared.
  # presumably so that any cached ordering is rebuilt -- verify
  a.rehash
  return a
end
Autodetects the database class from the text. Returns a database class on success. Returns nil on failure.
# File lib/bio/io/flatfile/autodetection.rb, line 305
#
# Autodetects the database class from the given text.
#
# Each element of +elements+ is asked in turn to +guess+ from _text_ and
# _meta_. The first truthy guess is returned and the remaining elements
# are not consulted.
#
# Arguments:
# * _text_: text to examine
# * _meta_: (optional) meta information passed unchanged to every
#   element's +guess+ (defaults to an empty Hash)
# Returns:: the first truthy guess, or nil if no element succeeds
def autodetect(text, meta = {})
  elements.each do |e|
    #$stderr.puts e.name
    guessed = e.guess(text, meta)
    # nil and false both mean "this element cannot tell"; keep looking.
    return guessed if guessed
  end
  # Always answer nil on failure, as documented. Previously the value of
  # the last guess leaked out, so the result could be false instead of nil
  # when the final element answered false.
  nil
end
Autodetects the database class from the FlatFile object. Returns a database class on success. Returns nil on failure.
# File lib/bio/io/flatfile/autodetection.rb, line 318
#
# Autodetects the database class from a FlatFile object.
#
# The stream is taken from the <tt>@stream</tt> instance variable of _ff_.
# If the stream reports a path, detection is first tried once on whatever
# is already in the prefetch buffer, with the path stored in the meta
# information. After that, up to _lines_ lines are prefetched one at a
# time, and detection is retried on the prefetch buffer after every line
# that is not blank.
#
# Arguments:
# * _ff_: object holding the input stream in <tt>@stream</tt>
# * _lines_: (optional) maximum number of lines to prefetch (default 31)
# Returns:: the result of the first successful +autodetect+ call, or nil
def autodetect_flatfile(ff, lines = 31)
  meta = {}
  stream = ff.instance_eval { @stream }

  # A stream without a usable +path+ method simply has no path.
  path = begin
    stream.path
  rescue NameError
    nil
  end

  if path
    meta[:path] = path
    # call autodetect once with meta and without any read action
    found = self.autodetect(stream.prefetch_buffer, meta)
    return found if found
  end

  # reading stream
  1.upto(lines) do
    line = stream.prefetch_gets
    break unless line
    # whitespace-only lines add nothing worth re-examining
    next if line.strip.empty?
    found = self.autodetect(stream.prefetch_buffer, meta)
    return found if found
  end

  nil
end
Iterates over each element.
# File lib/bio/io/flatfile/autodetection.rb, line 298
#
# Calls the given block once for each element of +elements+, in the order
# +elements+ provides them. The block is handed straight to
# <tt>elements.each</tt>, whose result is returned.
def each_rule(&x) #:yields: elem
  elements.each(&x)
end
visualizes the object (mainly for debug)
# File lib/bio/io/flatfile/autodetection.rb, line 291
#
# Returns a debugging string: the class name followed by the inspected
# name of every element, separated by single spaces and wrapped in
# angle brackets.
def inspect
  inspected_names = self.elements.collect { |element| element.name.inspect }
  "<#{self.class} #{inspected_names.join(' ')}>"
end
rebuilds the object and clears internal cache.
# File lib/bio/io/flatfile/autodetection.rb, line 285
#
# Re-indexes the internal rule table (<tt>@rules.rehash</tt>) and discards
# the cached element list by resetting <tt>@elements</tt> to nil.
#
# Returns:: nil
def rehash
  @rules.rehash
  # drop the cached element list
  @elements = nil
  nil
end
(required by TSort.) For a given element, yields each child (= lower priority elements) of the element.
# File lib/bio/io/flatfile/autodetection.rb, line 253
#
# (required by TSort.)
# For a given element, yields each child (= lower priority element).
#
# * For TopRule: every registered rule except TopRule itself and rules
#   that list TopRule among their own lower priority elements.
# * For BottomRule: every registered rule that lists BottomRule among its
#   higher priority elements.
# * For any other element: its lower priority elements other than
#   BottomRule, followed by BottomRule itself unless the element lists
#   BottomRule among its higher priority elements.
def tsort_each_child(elem)
  if elem == TopRule
    @rules.each_value do |rule|
      next if rule == TopRule || rule.lower_priority_elements.index(TopRule)
      yield rule
    end
  elsif elem == BottomRule
    @rules.each_value do |rule|
      next unless rule.higher_priority_elements.index(BottomRule)
      yield rule
    end
  else
    elem.lower_priority_elements.each do |lower|
      # BottomRule is handled separately just below
      yield lower if lower != BottomRule
    end
    yield BottomRule unless elem.higher_priority_elements.index(BottomRule)
  end
end