Class Bio::FastaDefline
In: lib/bio/db/fasta/defline.rb
Parent: Object

Parses a FASTA definition line (defline) and extracts IDs and other information. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or ":"-separated IDs.

The specifications are described in: ftp.ncbi.nih.gov/blast/documents/README.formatdb and blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers

Examples

  rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
  rub.entry_id       ==> 'gi|671595'
  rub.get('emb')     ==> 'CAA85678.1'
  rub.emb            ==> 'CAA85678.1'
  rub.gi             ==> '671595'
  rub.accession      ==> 'CAA85678'
  rub.accessions     ==> [ 'CAA85678' ]
  rub.acc_version    ==> 'CAA85678.1'
  rub.locus          ==> nil
  rub.list_ids       ==> [["gi", "671595"],
                          ["emb", "CAA85678.1", nil],
                          ["Perovskia abrotanoides"]]

  ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
  ckr.entry_id      ==> "gi|2495000"
  ckr.sp            ==> "CCKR_CAVPO"
  ckr.pir           ==> "I51898"
  ckr.gb            ==> "AAB29504.1"
  ckr.gi            ==> "2495000"
  ckr.accession     ==> "AAB29504"
  ckr.accessions    ==> ["Q63931", "AAB29504"]
  ckr.acc_version   ==> "AAB29504.1"
  ckr.locus         ==> nil
  ckr.description   ==>
    "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
  ckr.descriptions  ==>
    ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
     "cholecystokinin A receptor - guinea pig",
     "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
  ckr.words         ==>
    ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
     "receptor", "type"]
  ckr.id_strings    ==>
    ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
     "544724", "AAB29504.1", "Cavia"]
  ckr.list_ids      ==>
    [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
     ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
     ["gb", "AAB29504.1", nil], ["Cavia"]]

References

Methods

Constants

# Maps each NSID database tag to the ordered list of field labels that follow
# the tag inside a '|'-separated identifier.
#
# The position of a label in the array is the position of the corresponding
# value after the tag: get, get_by_type and get_all_by_type look a label up
# with labels.index(label) and read the ID entry at that index + 1.
NSIDs = {
  # NCBI and WU-BLAST
  'gi'  => [ 'gi' ],                          # NCBI GI
  'gb'  => [ 'acc_version', 'locus' ],        # GenBank
  'emb' => [ 'acc_version', 'locus' ],        # EMBL
  'dbj' => [ 'acc_version', 'locus' ],        # DDBJ
  'sp'  => [ 'accession', 'entry_id' ],       # SWISS-PROT
  'pdb' => [ 'entry_id', 'chain' ],           # PDB
  'bbs' => [ 'number' ],                      # GenInfo Backbone Id
  'gnl' => [ 'database' , 'entry_id' ],       # General database identifier
  'ref' => [ 'acc_version' , 'locus' ],       # NCBI Reference Sequence
  'lcl' => [ 'entry_id' ],                    # Local Sequence identifier

  # WU-BLAST and NCBI
  'pir' => [ 'accession', 'entry_id' ],       # PIR
  'prf' => [ 'accession', 'entry_id' ],       # Protein Research Foundation
  'pat' => [ 'country', 'number', 'serial' ], # Patents

  # WU-BLAST only
  'bbm' => [ 'number' ],                      # NCBI GenInfo Backbone database identifier
  'gim' => [ 'number' ],                      # NCBI GenInfo Import identifier
  'gp'  => [ 'acc_version', 'locus' ],        # GenPept
  'oth' => [ 'accession', 'name', 'release' ], # Other (user-definable) identifier
  'tpd' => [ 'accession', 'name' ],           # Third party annotation, DDBJ
  'tpe' => [ 'accession', 'name' ],           # Third party annotation, EMBL
  'tpg' => [ 'accession', 'name' ],           # Third party annotation, GenBank

  # Original
  'ri'  => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
}
# Common words that carry no identifying information (articles, auxiliaries,
# prepositions, generic sequence-annotation vocabulary).
KillWords = %w[
  an the this that is are were was be can may might
  as at by for in of on to with from and or not
  dna rna mrna cdna orf
  aa nt pct id ec sp subsp
  similar involved identical identity
  cds clone library contig contigs
  homolog homologue homologs homologous
  protein proteins gene genes product products
  sequence sequences strain strains region regions
]

# Default +kwhash+ argument of #words, which looks tokens up by their
# downcased form. It starts out empty here.
# NOTE(review): presumably filled from KillWords elsewhere in the file --
# that code is not part of this listing; confirm.
KillWordsHash = {}
# Default +kill_regexp+ argument of #words: a token matching any of these
# patterns is dropped from the word list.
#
# NOTE(review): the original listing ended without the closing bracket; the
# three patterns below are the ones that were visible -- confirm against
# lib/bio/db/fasta/defline.rb that no further entries follow.
KillRegexpArray = [
  /\A\d{1,3}\%?\z/,                               # one to three digits, optional '%'
  /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,  # capital-led token with digit(s) followed by more characters
  /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/               # upper-case token containing an underscore
]

Attributes

entry_id  [R]  Shows a possibly unique identifier. Returns a string.
list_ids  [R]  Shows an array that contains IDs (or ID-like strings). Returns an array of arrays of strings; a missing field is represented by nil.

Public Class methods

Parses given string.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 176
# Builds a defline object from +str+.
#
# +str+ may contain several deflines joined by the "\x01" (Ctrl-A)
# character; every piece is handed to add_defline in its original order.
def initialize(str)
  @entry_id = nil
  @list_ids = []
  @info     = {}
  @deflines = []

  str.split("\x01").each { |piece| add_defline(piece) }
end

Public Instance methods

Shows the accession with its version number. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 489
# Returns the 'acc_version' identifier found by get_by_type, or nil.
# The lookup runs once; the result (even nil) is cached in @acc_version.
def acc_version
  return @acc_version if defined?(@acc_version)

  @acc_version = get_by_type('acc_version')
end

Shows an accession number.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 507
# Returns a single accession number, or nil.
#
# When acc_version is available, its part before the first '.' is used;
# otherwise the first element of accessions is returned. The result is
# cached in @accession.
def accession
  return @accession if defined?(@accession)

  versioned = acc_version
  @accession = versioned ? versioned.split('.').first : accessions.first
end

Shows accession numbers. Returns an array of strings.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 498
# Returns all 'accession' and 'acc_version' identifiers as an array of
# strings, with everything from the first '.' onwards removed.
# The array is computed once and cached in @accessions.
def accessions
  return @accessions if defined?(@accessions)

  raw_ids = get_all_by_type('accession', 'acc_version')
  # collect! rewrites the array in place and returns that same array.
  @accessions = raw_ids.collect! { |id| id.sub(/\..*\z/, '') }
end

Parses given string and adds parsed data.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 190
190:     def add_defline(str) # Parses one defline +str+, records its IDs, and appends the parsed line to @deflines.
191:       case str
192:       when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
193:         # NSIDs: the ID part contains at least one '|' ($1), the rest is the description ($2)
194:         # examples:
195:         # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
196:         #
197:         # note: regexp (?:...) means grouping without capturing (no backreference)
198:         i = $1
199:         d = $2
200:         tks = i.split('|')
201:         tks << '' if i[-1,1] == '|' # String#split drops a trailing empty field; put it back
202:         a = parse_NSIDs(tks) # helper not shown here
203:         i = a[0].join('|') # the first parsed ID is this line's entry-id candidate
204:         a.unshift('|') # element 0 of a parsed line is the separator (to_s reads it back)
205:         d = tks.join('|') + ' ' + d unless tks.empty? # presumably parse_NSIDs consumes tks and leftovers belong to the description -- TODO confirm
206:         a << d # the last element of a parsed line is the description
207:         this_line = a
208:         match_EC(d) # helper not shown; presumably records EC numbers -- TODO confirm
209:         parse_square_brackets(d).each do |x| # helper not shown; yields each bracketed text
210:           if !match_EC(x, false) and x =~ /\A[A-Z]/ then
211:             di = [  x ]
212:             @list_ids << di
213:             @info['organism'] = x unless @info['organism'] # only the first such text is kept as organism
214:           end
215:         end
216: 
217:       when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
218:         # ':'-separated ID ($1) followed by the description ($2); examples:
219:         # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
220:         # >emb:CACDC28 [X80034] C.albicans CDC28 gene
221:         i = $1
222:         d = $2
223:         a = parse_ColonSepID(i) # helper not shown here
224:         i = a.join(':')
225:         this_line = [ ':', a , d ] # same layout as above: separator, ID fields, description
226:         match_EC(d)
227:         parse_square_brackets(d).each do |x|
228:           if !match_EC(x, false) and x =~ /:/ then
229:             parse_ColonSepID(x) # return value unused; presumably registers the ID itself -- TODO confirm
230:           elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then
231:             @list_ids << [ $1 ] # bare upper-case token in brackets is kept as an ID-like string
232:           end
233:         end
234: 
235:       when /^\>?\s*(\S+)(?:\s+(.+))?$/
236:         # first word is the ID, the optional rest is the description; examples:
237:         # >ABC12345 this is test
238:         i = $1
239:         d = $2.to_s # $2 is nil when there is no description
240:         @list_ids << [ i.chomp('.') ] # stored without a trailing '.', while this_line keeps i as is
241:         this_line = [  '', [ i ], d ]
242:         match_EC(d)
243:       else
244:         i = str # nothing matched: the whole string is used as the ID
245:         d = ''
246:         match_EC(i)
247:         this_line = [ '', [ i ], d ]
248:       end
249: 
250:       @deflines << this_line
251:       @entry_id = i unless @entry_id # the first defline added decides entry_id
252:     end

Shows description.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 332
# Returns the description (last element) of the first parsed defline,
# or nil when no defline has been added.
def description
  first_line = @deflines.first
  # nil.to_a is [], so an empty @deflines yields nil instead of raising.
  first_line.to_a.last
end

Returns descriptions.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 337
# Returns the description (last element) of every parsed defline,
# in the order the deflines were added.
def descriptions
  @deflines.map { |parsed_line| parsed_line.last }
end

Returns an identifier by database name.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 413
# Returns the identifier stored for database +dbname+ (String or Symbol),
# or nil when nothing is known for it.
#
# Lookup order:
# 1. a value already present in @info under that name;
# 2. the first entry of @list_ids whose tag (element 0) equals the name:
#    * entries with at most two elements yield their last element;
#    * longer entries yield the first non-nil field among the labels
#      acc_version, entry_id, locus, accession, number (positions taken
#      from NSIDs), falling back to the first non-nil field of the entry.
#
# A result found through @list_ids is cached in @info.
def get(dbname)
  db = dbname.to_s
  cached = @info[db]
  return cached if cached

  di = @list_ids.find { |x| x[0] == db }
  return nil unless di

  r = nil
  if di.size <= 2
    r = di[-1]
  else
    # A tag without an NSIDs entry has no labels; treat it as "no preferred
    # field" rather than calling nil.index (NoMethodError).
    # NOTE(review): whether such 3+-field entries can occur depends on the
    # parsers that fill @list_ids, which are not shown here.
    labels = self.class::NSIDs[db] || []
    %w[acc_version entry_id locus accession number].each do |label|
      i = labels.index(label)
      next unless i

      r = di[i + 1]
      break if r
    end
    r ||= di[1..-1].find { |x| x }
  end
  @info[db] = r if r
  r
end

Returns identifiers by given type.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 449
# Collects every non-nil identifier whose field label is one of
# +type_strarg+ (for example 'accession', 'acc_version').
#
# Entries of @list_ids are visited in order; within one entry the labels
# are checked in the order given. Entries whose tag has no NSIDs
# definition are skipped. Returns an array (empty when nothing matches).
def get_all_by_type(*type_strarg)
  @list_ids.each_with_object([]) do |id_entry, found|
    labels = self.class::NSIDs[id_entry[0]]
    next unless labels

    type_strarg.each do |wanted|
      pos = labels.index(wanted)
      next unless pos

      # The value sits one slot after the label position (slot 0 is the tag).
      value = id_entry[pos + 1]
      found << value if value
    end
  end
end

Returns an identifier by given type.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 437
# Returns the field labelled +type_str+ from the first entry of @list_ids
# whose tag defines that label in NSIDs.
#
# Only that first entry is consulted: if its field is nil, nil is returned
# even when a later entry carries a value. Returns nil as well when no
# entry defines the label.
def get_by_type(type_str)
  @list_ids.each do |id_entry|
    labels = self.class::NSIDs[id_entry[0]]
    next unless labels

    pos = labels.index(type_str)
    return id_entry[pos + 1] if pos
  end
  nil
end

Shows the GI. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 478
# Returns the 'gi' identifier found by get_by_type, or nil.
# The lookup runs once; the result (even nil) is cached in @gi.
def gi
  return @gi if defined?(@gi)

  @gi = get_by_type('gi')
end

Shows ID-like strings. Returns an array of strings.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 345
# Returns an array of ID-like strings.
#
# Taken from @list_ids:
# * entries with two or more elements contribute every non-nil element
#   after the tag;
# * single-element entries contribute that element when it is non-empty and
#   consists only of letters, digits, '.', '-' and '_'.
#
# Followed by the case-sensitive words of the descriptions (words(true, []))
# that look like identifiers.
def id_strings
  ids = []
  @list_ids.each do |entry|
    if entry.size >= 2
      ids.concat(entry[1..-1].select { |field| field })
    elsif entry[0].to_s.size > 0 && entry[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
      ids << entry[0]
    end
  end

  id_like_words = words(true, []).select do |word|
    word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
      word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  end
  ids.concat(id_like_words)
end

Shows the locus. If the entry has two or more such IDs, only the first one is shown. Returns a string or nil.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 467
# Returns the 'locus' identifier found by get_by_type, or nil.
# The lookup runs once; the result (even nil) is cached in @locus.
def locus
  return @locus if defined?(@locus)

  @locus = get_by_type('locus')
end

[Source]

     # File lib/bio/db/fasta/defline.rb, line 518
# Lets a database name be used as a method, e.g. +defline.emb+ is the same
# as +defline.get('emb')+.
#
# Returns whatever get returns. When get finds nothing, nil is returned if
# +name+ is a tag defined in NSIDs; for any other name a RuntimeError whose
# message starts with "NameError:" is raised (kept as is for callers that
# rescue it).
#
# Extra arguments are forwarded to get, which accepts exactly one argument,
# so passing any makes Ruby raise ArgumentError.
def method_missing(name, *args)
  r = get(name, *args)
  return r if r
  return r if self.class::NSIDs[name.to_s]

  raise "NameError: undefined method `#{name.inspect}'"
end

# Keeps respond_to? / method in agreement with method_missing: a name is
# answered when it is an NSIDs tag or when get finds a value for it.
def respond_to_missing?(name, include_private = false)
  key = name.to_s
  return true if self.class::NSIDs[key] || get(key)

  super
end

Shows the defline as a string. Note that the result may differ from the original string that was given to FastaDefline.new.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 324
324:     def to_s
325:       @deflines.collect { |a|
326:         s = a[0]
327:         (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
328:       }.join("\x01")
329:     end

Shows words used in the defline. Returns an Array.

[Source]

     # File lib/bio/db/fasta/defline.rb, line 387
# Returns the sorted, de-duplicated words found in all descriptions.
#
# case_sensitive:: when false/nil (default) the words are downcased
# kill_regexp::    array of Regexps; a token matching any of them is dropped
# kwhash::         hash of words to drop, looked up by the downcased token
#
# Descriptions are split on punctuation, whitespace and control characters.
# Leading '$', '*', '-', '+' and trailing '$', '*', '-', '=' are removed from
# each token, and tokens of one character or less are dropped.
def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
          kwhash = self.class::KillWordsHash)
  separators = /[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/
  kept = descriptions.join(' ').split(separators).map do |raw|
    token = raw.sub(/\A[\$\*\-\+]+/, '').sub(/[\$\*\-\=]+\z/, '')
    next if token.size <= 1
    next if kwhash[token.downcase]
    next if kill_regexp.find { |expr| expr =~ token }

    token
  end.compact
  kept = kept.map { |word| word.downcase } unless case_sensitive
  kept.sort.uniq
end

[Validate]