Class | Bio::FastaDefline |
In: |
lib/bio/db/fasta/defline.rb
|
Parent: | Object |
Parses a FASTA defline and extracts IDs and other information. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or ":"-separated IDs.
specs are described in: ftp.ncbi.nih.gov/blast/documents/README.formatdb blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]') rub.entry_id ==> 'gi|671595' rub.get('emb') ==> 'CAA85678.1' rub.emb ==> 'CAA85678.1' rub.gi ==> '671595' rub.accession ==> 'CAA85678' rub.accessions ==> [ 'CAA85678' ] rub.acc_version ==> 'CAA85678.1' rub.locus ==> nil rub.list_ids ==> [["gi", "671595"], ["emb", "CAA85678.1", nil], ["Perovskia abrotanoides"]] ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]") ckr.entry_id ==> "gi|2495000" ckr.sp ==> "CCKR_CAVPO" ckr.pir ==> "I51898" ckr.gb ==> "AAB29504.1" ckr.gi ==> "2495000" ckr.accession ==> "AAB29504" ckr.accessions ==> ["Q63931", "AAB29504"] ckr.acc_version ==> "AAB29504.1" ckr.locus ==> nil ckr.description ==> "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)" ckr.descriptions ==> ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)", "cholecystokinin A receptor - guinea pig", "cholecystokinin A receptor; CCK-A receptor [Cavia]"] ckr.words ==> ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig", "receptor", "type"] ckr.id_strings ==> ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898", "544724", "AAB29504.1", "Cavia"] ckr.list_ids ==> [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"], ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"], ["gb", "AAB29504.1", nil], ["Cavia"]]
NSIDs | = | { # NCBI and WU-BLAST 'gi' => [ 'gi' ], # NCBI GI 'gb' => [ 'acc_version', 'locus' ], # GenBank 'emb' => [ 'acc_version', 'locus' ], # EMBL 'dbj' => [ 'acc_version', 'locus' ], # DDBJ 'sp' => [ 'accession', 'entry_id' ], # SWISS-PROT 'pdb' => [ 'entry_id', 'chain' ], # PDB 'bbs' => [ 'number' ], # GenInfo Backbone Id 'gnl' => [ 'database' , 'entry_id' ], # General database identifier 'ref' => [ 'acc_version' , 'locus' ], # NCBI Reference Sequence 'lcl' => [ 'entry_id' ], # Local Sequence identifier # WU-BLAST and NCBI 'pir' => [ 'accession', 'entry_id' ], # PIR 'prf' => [ 'accession', 'entry_id' ], # Protein Research Foundation 'pat' => [ 'country', 'number', 'serial' ], # Patents # WU-BLAST only 'bbm' => [ 'number' ], # NCBI GenInfo Backbone database identifier 'gim' => [ 'number' ], # NCBI GenInfo Import identifier 'gp' => [ 'acc_version', 'locus' ], # GenPept 'oth' => [ 'accession', 'name', 'release' ], # Other (user-definable) identifier 'tpd' => [ 'accession', 'name' ], # Third party annotation, DDBJ 'tpe' => [ 'accession', 'name' ], # Third party annotation, EMBL 'tpg' => [ 'accession', 'name' ], # Third party annotation, GenBank # Original 'ri' => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB } |
KillWords | = | [ 'an', 'the', 'this', 'that', 'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might', 'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with', 'from', 'and', 'or', 'not', 'dna', 'rna', 'mrna', 'cdna', 'orf', 'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp', 'similar', 'involved', 'identical', 'identity', 'cds', 'clone', 'library', 'contig', 'contigs', 'homolog', 'homologue', 'homologs', 'homologous', 'protein', 'proteins', 'gene', 'genes', 'product', 'products', 'sequence', 'sequences', 'strain', 'strains', 'region', 'regions', ] |
KillWordsHash | = | {} |
KillRegexpArray | = | [ /\A\d{1,3}\%?\z/, /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/, /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/ ] |
entry_id | [R] | Shows a possibly unique identifier. Returns a string. |
list_ids | [R] | Shows array that contains IDs (or ID-like strings). Returns an array of arrays of strings. |
Parses given string.
# File lib/bio/db/fasta/defline.rb, line 176
# Builds a new object from a raw defline string.
#
# The internal state is reset first; then every "\x01"-separated piece
# of +str+ is handed to add_defline, in the order it appears.
def initialize(str)
  @entry_id = nil
  @list_ids = []
  @info     = {}
  @deflines = []

  str.split("\x01").each { |piece| add_defline(piece) }
end
Shows the accession with version number. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.
# File lib/bio/db/fasta/defline.rb, line 489
# Accession with version number (the 'acc_version' field), or nil.
#
# The lookup runs once; the result, nil included, is kept in
# @acc_version and returned on every later call.
def acc_version
  return @acc_version if defined?(@acc_version)
  @acc_version = get_by_type('acc_version')
end
Shows accession numbers. Returns an array of strings.
# File lib/bio/db/fasta/defline.rb, line 498
# All accession numbers as an array of strings, memoized in @accessions.
#
# Values come from the 'accession' and 'acc_version' fields; everything
# from the first "." onward (the version suffix) is removed.
def accessions
  return @accessions if defined?(@accessions)
  found = get_all_by_type('accession', 'acc_version')
  @accessions = found.collect! { |id| id.sub(/\..*\z/, '') }
end
Parses given string and adds parsed data.
# File lib/bio/db/fasta/defline.rb, line 190 190: def add_defline(str) 191: case str 192: when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/ 193: # NSIDs 194: # examples: 195: # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P 196: # 197: # note: regexp (:?) means grouping without backreferences 198: i = $1 199: d = $2 200: tks = i.split('|') 201: tks << '' if i[-1,1] == '|' 202: a = parse_NSIDs(tks) 203: i = a[0].join('|') 204: a.unshift('|') 205: d = tks.join('|') + ' ' + d unless tks.empty? 206: a << d 207: this_line = a 208: match_EC(d) 209: parse_square_brackets(d).each do |x| 210: if !match_EC(x, false) and x =~ /\A[A-Z]/ then 211: di = [ x ] 212: @list_ids << di 213: @info['organism'] = x unless @info['organism'] 214: end 215: end 216: 217: when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/ 218: # examples: 219: # >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST] 220: # >emb:CACDC28 [X80034] C.albicans CDC28 gene 221: i = $1 222: d = $2 223: a = parse_ColonSepID(i) 224: i = a.join(':') 225: this_line = [ ':', a , d ] 226: match_EC(d) 227: parse_square_brackets(d).each do |x| 228: if !match_EC(x, false) and x =~ /:/ then 229: parse_ColonSepID(x) 230: elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then 231: @list_ids << [ $1 ] 232: end 233: end 234: 235: when /^\>?\s*(\S+)(?:\s+(.+))?$/ 236: # examples: 237: # >ABC12345 this is test 238: i = $1 239: d = $2.to_s 240: @list_ids << [ i.chomp('.') ] 241: this_line = [ '', [ i ], d ] 242: match_EC(d) 243: else 244: i = str 245: d = '' 246: match_EC(i) 247: this_line = [ '', [ i ], d ] 248: end 249: 250: @deflines << this_line 251: @entry_id = i unless @entry_id 252: end
Shows description.
# File lib/bio/db/fasta/defline.rb, line 332
# Description of the first defline, i.e. the last element of the first
# entry in @deflines.  Returns nil when no defline has been parsed.
def description
  first_line = @deflines.first
  first_line.to_a.last
end
Returns descriptions.
# File lib/bio/db/fasta/defline.rb, line 337
# Descriptions of all parsed deflines, in parsing order.
# Each one is the last element of the corresponding @deflines entry.
def descriptions
  @deflines.map { |parsed| parsed.last }
end
Returns an identifier by database name.
# File lib/bio/db/fasta/defline.rb, line 413
# Returns the identifier stored for database +dbname+, or nil.
#
# +dbname+ may be a String or a Symbol; it is converted with to_s.
# A value already present in @info is returned directly.  Otherwise the
# first @list_ids entry whose tag equals the name is used:
# * with at most two elements, its last element is the result;
# * with more, the field labelled (in NSIDs) 'acc_version', 'entry_id',
#   'locus', 'accession' or 'number' -- tried in that order -- wins, and
#   if none of them holds a value the first non-nil field is taken.
# A non-nil result is cached in @info.
def get(dbname)
  db = dbname.to_s
  cached = @info[db]
  return cached if cached

  di = @list_ids.find { |x| x[0] == db }
  return nil unless di

  r = nil
  if di.size <= 2 then
    r = di[-1]
  else
    # A tag that is not an NSIDs key has no labels; treat that as "no
    # preferred field" instead of calling index on nil.
    labels = self.class::NSIDs[db] || []
    [ 'acc_version', 'entry_id',
      'locus', 'accession', 'number' ].each do |x|
      i = labels.index(x)
      r = di[i + 1] if i
      break if r
    end
    r ||= di[1..-1].find { |x| x }
  end
  @info[db] = r if r
  r
end
Returns identifiers by given type.
# File lib/bio/db/fasta/defline.rb, line 449
# Collects every identifier whose field label is one of +type_strarg+.
#
# For each @list_ids entry whose tag is an NSIDs key, the requested
# labels are looked up in the order given; nil fields are skipped.
# Returns an array of strings (empty when nothing matches).
def get_all_by_type(*type_strarg)
  collected = []
  @list_ids.each do |id_entry|
    labels = self.class::NSIDs[id_entry[0]]
    next unless labels
    type_strarg.each do |wanted|
      pos = labels.index(wanted)
      next unless pos
      value = id_entry[pos + 1]
      collected << value if value
    end
  end
  collected
end
Returns an identifier by given type.
# File lib/bio/db/fasta/defline.rb, line 437
# Returns the first identifier whose field label is +type_str+, or nil.
#
# Only @list_ids entries whose tag is an NSIDs key are considered.  An
# entry that has the label but an empty (nil) field is skipped, so a
# later entry carrying a value is still found -- the same rule
# get_all_by_type applies.
def get_by_type(type_str)
  @list_ids.each do |x|
    labels = self.class::NSIDs[x[0]]
    next unless labels
    i = labels.index(type_str)
    next unless i
    value = x[i + 1]
    return value if value
  end
  nil
end
Shows the GI. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.
# File lib/bio/db/fasta/defline.rb, line 478
# NCBI GI of the entry (the 'gi' field), or nil.
#
# Looked up once; the result, nil included, is kept in @gi and reused.
def gi
  return @gi if defined?(@gi)
  @gi = get_by_type('gi')
end
Shows ID-like strings. Returns an array of strings.
# File lib/bio/db/fasta/defline.rb, line 345
# Returns every ID-like string of the entry as an array of strings.
#
# Sources, in this order:
# 1. each @list_ids entry: all non-nil fields after the tag, or -- for a
#    single-element entry -- the element itself if it consists only of
#    letters, digits, '.', '-' and '_';
# 2. case-sensitive description words (no kill-regexps applied) that
#    look like identifiers.
def id_strings
  ids = []
  @list_ids.each do |entry|
    if entry.size >= 2
      ids.concat(entry[1..-1].select { |field| field })
    else
      head = entry[0]
      ids << head if head.to_s.size > 0 && head =~ /\A[A-Za-z0-9\.\-\_]+\z/
    end
  end

  id_like_words = words(true, []).select do |word|
    word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
      word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  end
  ids.concat(id_like_words)
  ids
end
# File lib/bio/db/fasta/defline.rb, line 518
# Lets identifiers be read through methods named after their database:
# <tt>defline.emb</tt> is the same as <tt>defline.get('emb')</tt>.
#
# Returns whatever get returns.  nil is a valid result when +name+ is an
# NSIDs key but the defline carries no such ID.
# Raises RuntimeError (message beginning with "NameError:") when get
# finds nothing and +name+ is not an NSIDs key.
def method_missing(name, *args)
  # raise ArgumentError,
  # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
  r = get(name, *args)
  if !r and !(self.class::NSIDs[name.to_s]) then
    raise "NameError: undefined method `#{name.inspect}'"
  end
  r
end

# Companion of method_missing, so respond_to? agrees with it.
#
# True exactly when method_missing(name) would return instead of raising.
# Without this, Ruby's implicit-conversion probes (e.g. to_ary from
# Array#flatten or puts) reach method_missing and receive its
# RuntimeError, which those probes do not rescue.
def respond_to_missing?(name, include_private = false)
  return true if self.class::NSIDs[name.to_s]
  get(name) ? true : super
end
Shows the original string. Note that the result of this method may differ from the original string that was given to FastaDefline.new.
# File lib/bio/db/fasta/defline.rb, line 324 324: def to_s 325: @deflines.collect { |a| 326: s = a[0] 327: (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip 328: }.join("\x01") 329: end
Shows words used in the defline. Returns an Array.
# File lib/bio/db/fasta/defline.rb, line 387
# Returns the sorted, de-duplicated words used in the descriptions.
#
# case_sensitive:: when false or nil (default) the words are downcased.
# kill_regexp::    array of regexps; a word matching any of them is dropped.
# kwhash::         hash keyed by downcased words that are to be dropped.
#
# All descriptions are joined and split on punctuation, whitespace and
# control characters.  Leading "$*-+" and trailing "$*-=" characters are
# trimmed from each word, and words of one character or less are dropped.
def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
          kwhash = self.class::KillWordsHash)
  tokens = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
  kept = []
  tokens.each do |tok|
    tok.sub!(/\A[\$\*\-\+]+/, '')
    tok.sub!(/[\$\*\-\=]+\z/, '')
    next if tok.size <= 1
    next if kwhash[tok.downcase]
    next if kill_regexp.find { |expr| expr =~ tok }
    kept << tok
  end
  kept.collect! { |w| w.downcase } unless case_sensitive
  kept.sort!
  kept.uniq!
  kept
end