Class | Bio::REBASE |
In: |
lib/bio/db/rebase.rb
|
Parent: | Object |
bio/db/rebase.rb - Interface for EMBOSS formatted REBASE files
Author: | Trevor Wennblom <trevor@corevx.com> |
Copyright: | Copyright (c) 2005-2007 Midwinter Laboratories, LLC (midwinterlabs.com) |
License: | The Ruby License |
Bio::REBASE provides utilities for interacting with REBASE data in EMBOSS format. REBASE is the Restriction Enzyme Database; more information can be found at http://rebase.neb.com
EMBOSS formatted files located at:
These files are the same as the "emboss_?.???" files located at ftp://ftp.neb.com/pub/rebase/
To easily get started with the data you can simply type this command at your shell prompt:
% wget "ftp://ftp.neb.com/pub/rebase/emboss_*"
require 'bio'
require 'pp'

enz = File.read('emboss_e')
ref = File.read('emboss_r')
sup = File.read('emboss_s')

# When creating a new instance of Bio::REBASE
# the contents of the enzyme file must be passed.
# The references and suppliers file contents
# may also be passed.
rebase = Bio::REBASE.new( enz )
rebase = Bio::REBASE.new( enz, ref )
rebase = Bio::REBASE.new( enz, ref, sup )

# The 'read' class method allows you to read in files
# that are REBASE EMBOSS formatted
rebase = Bio::REBASE.read( 'emboss_e' )
rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r' )
rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r', 'emboss_s' )

# The data loaded may be saved in YAML format
rebase.save_yaml( 'enz.yaml' )
rebase.save_yaml( 'enz.yaml', 'ref.yaml' )
rebase.save_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )

# YAML formatted files can also be read with the
# class method 'load_yaml'
rebase = Bio::REBASE.load_yaml( 'enz.yaml' )
rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml' )
rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )

pp rebase.enzymes[0..4]                     # ["AarI", "AasI", "AatI", "AatII", "Acc16I"]
pp rebase.enzyme_name?('aasi')              # true
pp rebase['AarI'].pattern                   # "CACCTGC"
pp rebase['AarI'].blunt?                    # false
pp rebase['AarI'].organism                  # "Arthrobacter aurescens SS2-322"
pp rebase['AarI'].source                    # "A. Janulaitis"
pp rebase['AarI'].primary_strand_cut1       # 11
pp rebase['AarI'].primary_strand_cut2       # 0
pp rebase['AarI'].complementary_strand_cut1 # 15
pp rebase['AarI'].complementary_strand_cut2 # 0
pp rebase['AarI'].suppliers                 # ["F"]
pp rebase['AarI'].supplier_names            # ["Fermentas International Inc."]

pp rebase['AarI'].isoschizomers             # Currently none stored in the references file
pp rebase['AarI'].methylation               # ""

pp rebase['EcoRII'].methylation             # "2(5)"
pp rebase['EcoRII'].suppliers               # ["F", "J", "M", "O", "S"]
pp rebase['EcoRII'].supplier_names          # ["Fermentas International Inc.", "Nippon Gene Co., Ltd.",
                                            #  "Roche Applied Science", "Toyobo Biochemicals",
                                            #  "Sigma Chemical Corporation"]

# Number of enzymes in the database
pp rebase.size                              # 673
pp rebase.enzymes.size                      # 673

rebase.each do |name, info|
  pp "#{name}: #{info.methylation}" unless info.methylation.empty?
end
Read YAML formatted files
rebase = Bio::REBASE.load_yaml( 'enz.yaml' )
rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml' )
rebase = Bio::REBASE.load_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
Arguments
Returns: | Bio::REBASE object |
# File lib/bio/db/rebase.rb, line 258
#
# Build an instance from YAML files (e.g. those written by +save_yaml+).
#
# Arguments
# * +f_enzyme+: path of the YAML file holding the enzyme data (required)
# * +f_reference+: (_optional_) path of the YAML file holding the reference data
# * +f_supplier+: (_optional_) path of the YAML file holding the supplier data
# Returns:: the object built by +new+ with its YAML flag set to +true+
#
# NOTE(review): the files are loaded with YAML.load_file; how strictly that
# restricts deserialised classes depends on the installed YAML library
# version -- confirm before pointing this at untrusted files.
def self.load_yaml( f_enzyme, f_reference=nil, f_supplier=nil )
  # Optional files yield nil when no path was supplied.
  load_optional = lambda { |path| path ? YAML.load_file(path) : nil }

  enzyme_data    = YAML.load_file(f_enzyme)
  reference_data = load_optional.call(f_reference)
  supplier_data  = load_optional.call(f_supplier)

  # The trailing +true+ tells the constructor the data is already parsed.
  self.new(enzyme_data, reference_data, supplier_data, true)
end
Constructor
Arguments
Returns: | Bio::REBASE |
# File lib/bio/db/rebase.rb, line 174
#
# Constructor
#
# Arguments
# * +enzyme_lines+: contents of the enzyme file, or already-parsed enzyme
#   data when +yaml+ is true
# * +reference_lines+: (_optional_) same, for the references file
# * +supplier_lines+: (_optional_) same, for the suppliers file
# * +yaml+: (_optional_, default +false+) when true the three arguments are
#   stored as-is instead of being run through the parse_* methods
# Returns:: Bio::REBASE
def initialize( enzyme_lines, reference_lines = nil, supplier_lines = nil, yaml = false )
  # Pre-parsed (YAML) data is taken verbatim; raw text goes through the parsers.
  @enzyme_data    = yaml ? enzyme_lines    : parse_enzymes(enzyme_lines)
  @reference_data = yaml ? reference_lines : parse_references(reference_lines)
  @supplier_data  = yaml ? supplier_lines  : parse_suppliers(supplier_lines)

  # Hand the supplier table to EnzymeEntry before the entries are built.
  EnzymeEntry.supplier_data = @supplier_data
  setup_enzyme_data
end
Read REBASE EMBOSS-formatted files
rebase = Bio::REBASE.read( 'emboss_e' )
rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r' )
rebase = Bio::REBASE.read( 'emboss_e', 'emboss_r', 'emboss_s' )
Arguments
Returns: | Bio::REBASE object |
# File lib/bio/db/rebase.rb, line 240
#
# Read REBASE EMBOSS-formatted files and build an instance from them.
#
# Arguments
# * +f_enzyme+: path of the enzyme file (required)
# * +f_reference+: (_optional_) path of the references file
# * +f_supplier+: (_optional_) path of the suppliers file
# Returns:: the object built by +new+
def self.read( f_enzyme, f_reference=nil, f_supplier=nil )
  # The parsers iterate their input with #each_line, which String provides
  # and Array (the result of IO.readlines) does not, so each file is handed
  # over as one String.
  e = File.read(f_enzyme)
  r = f_reference ? File.read(f_reference) : nil
  s = f_supplier ? File.read(f_supplier) : nil
  self.new(e, r, s)
end
Check if supplied name is the name of an available enzyme
Arguments
Returns: | +true/false+ |
# File lib/bio/db/rebase.rb, line 207
#
# Check if supplied name is the name of an available enzyme.
# The comparison is case-insensitive.
#
# Arguments
# * +name+: enzyme name; anything responding to +to_s+ (String, Symbol, nil)
# Returns:: +true/false+
def enzyme_name?(name)
  # to_s lets Symbols through and makes nil answer false instead of raising.
  @enzyme_names_downcased.include?(name.to_s.downcase)
end
Save the current data
rebase.save_yaml( 'enz.yaml' )
rebase.save_yaml( 'enz.yaml', 'ref.yaml' )
rebase.save_yaml( 'enz.yaml', 'ref.yaml', 'sup.yaml' )
Arguments
Returns: | nothing |
# File lib/bio/db/rebase.rb, line 222
#
# Save the current data as YAML.
#
# Arguments
# * +f_enzyme+: path the enzyme data is written to (required)
# * +f_reference+: (_optional_) path the reference data is written to
# * +f_supplier+: (_optional_) path the supplier data is written to
# Returns:: nothing (+nil+)
def save_yaml( f_enzyme, f_reference=nil, f_supplier=nil )
  # Serialise one data set to one file, truncating any existing content.
  write_yaml = lambda do |path, payload|
    File.open(path, 'w') { |io| io.puts YAML.dump(payload) }
  end

  write_yaml.call(f_enzyme, @enzyme_data)
  write_yaml.call(f_reference, @reference_data) if f_reference
  write_yaml.call(f_supplier, @supplier_data) if f_supplier
  nil
end
data is a hash indexed by the :name of each entry which is also a hash
# File lib/bio/db/rebase.rb, line 314
#
# Parse the contents of an EMBOSS-formatted REBASE enzyme file.
#
# Arguments
# * +lines+: the file contents, either as one object responding to
#   +each_line+ (String, IO) or as an Array of line Strings; +nil+ is allowed
# Returns:: a Hash indexed by enzyme name; each value is a Hash with the keys
#           :name, :pattern, :len, :ncuts, :blunt, :c1, :c2, :c3 and :c4,
#           filled from the whitespace-separated columns of the line, in
#           that order (all values are Strings, or nil for a missing column)
def parse_enzymes( lines )
  data = {}
  return data if lines.nil?

  # Accept both a single String/IO and an Array of lines.
  source = lines.respond_to?(:each_line) ? lines.each_line : lines.each

  source.each do |raw|
    next if raw.start_with?('#')   # comment line

    # split(' ') splits on runs of whitespace and drops the trailing
    # newline without modifying the caller's string.
    a = raw.split(' ')
    next if a.empty?               # blank line: would otherwise create a nil key

    data[ a[0] ] = {
      :name    => a[0],
      :pattern => a[1],
      :len     => a[2],
      :ncuts   => a[3],
      :blunt   => a[4],
      :c1      => a[5],
      :c2      => a[6],
      :c3      => a[7],
      :c4      => a[8]
    }
  end
  data
end
data is a hash indexed by the :name of each entry which is also a hash
# File lib/bio/db/rebase.rb, line 341
#
# Parse the contents of an EMBOSS-formatted REBASE references file.
#
# Lines starting with '#' (comments) and '//' (end-of-entry marker) are
# skipped. Every other line is positional: the first seven lines of an entry
# are name, organism, isoschizomers, methylation, source, suppliers and the
# number of references; that many reference lines follow. Blank lines are
# therefore data (an empty field) and are NOT skipped.
#
# Arguments
# * +lines+: the file contents, either as one object responding to
#   +each_line+ (String, IO) or as an Array of line Strings; +nil+ is allowed
# Returns:: a Hash indexed by enzyme name; each value is a Hash with the keys
#           :name, :organism, :isoschizomers, :methylation, :source,
#           :suppliers, :number_of_references (all Strings) and :references
#           (Array of Strings, empty when the entry declares no references)
def parse_references( lines )
  data = {}
  return data if lines.nil?

  header = []           # the seven fixed lines of the entry being read
  references = []       # reference lines collected so far for that entry
  references_expected = 0

  # Accept both a single String/IO and an Array of lines.
  source = lines.respond_to?(:each_line) ? lines.each_line : lines.each

  source.each do |raw|
    next if raw.start_with?('#')    # Comment
    next if raw.start_with?('//')   # End of entry marker
    line = raw.chomp                # non-destructive: caller's string is untouched

    if header.size < 7
      header << line
      next unless header.size == 7
      references_expected = line.to_i
      # An entry declaring no references is complete right here; waiting
      # for a reference line would swallow the following entries.
      next if references_expected > 0
    else
      references << line
      next if references.size < references_expected
    end

    data[ header[0] ] = {
      :name                 => header[0],
      :organism             => header[1],
      :isoschizomers        => header[2],
      :methylation          => header[3],
      :source               => header[4],
      :suppliers            => header[5],
      :number_of_references => header[6],
      :references           => references
    }
    header = []
    references = []
  end
  data
end
data is a hash indexed by the supplier code
data[supplier_code] returns the suppliers name
# File lib/bio/db/rebase.rb, line 387
#
# Parse the contents of an EMBOSS-formatted REBASE suppliers file.
#
# Each non-comment line is split at its first whitespace character into a
# supplier code and the supplier's name; lines that do not have both parts
# (e.g. blank lines) are ignored.
#
# Arguments
# * +lines+: the file contents, either as one object responding to
#   +each_line+ (String, IO) or as an Array of line Strings; +nil+ is allowed
# Returns:: a Hash indexed by supplier code; data[supplier_code] is the
#           supplier's name
def parse_suppliers( lines )
  data = {}
  return data if lines.nil?

  # Accept both a single String/IO and an Array of lines.
  source = lines.respond_to?(:each_line) ? lines.each_line : lines.each

  source.each do |raw|
    next if raw.start_with?('#')   # comment line

    # chomp (non-destructive) also removes a "\r\n" line ending, which
    # would otherwise leave a stray "\r" at the end of the supplier name.
    match = raw.chomp.match(%r{(.+?)\s(.+)})
    data[ match[1] ] = match[2] if match
  end
  data
end
Takes a string in one of the three formats listed below and returns a Bio::Reference object
# File lib/bio/db/rebase.rb, line 403
#
# Convert one raw reference line into a Bio::Reference object.
#
# The line is split on ', '. Working from the end of the line:
# * a final 'Unpublished observations.' becomes the title (without the
#   trailing period) and pages, volume, year and journal are left empty;
# * otherwise the last field (trailing character removed) is the pages when
#   it contains 'pp. ' -- the field before it is then the volume -- or else
#   the volume itself; the next field back is matched as '(year) journal'.
# The remaining leading fields are consumed in pairs and each pair becomes
# one author string "first_field, second_field"; an unpaired trailing field
# is dropped.
#
# Arguments
# * +line+: (String) the raw reference
# Returns:: Bio::Reference built from the keys 'title', 'pages', 'volume',
#           'year', 'journal' and 'authors'
def raw_to_reference( line )
  fields = line.split(', ')

  if fields.last == 'Unpublished observations.'
    title = fields.pop.chop
    pages = volume = year = journal = ''
  else
    title = ''

    tail = fields.pop.chop
    if tail =~ %r{pp\.\s}
      pages  = tail.gsub('pp. ', '')
      volume = fields.pop
    else
      pages  = ''
      volume = tail
    end
    volume = volume.gsub('vol. ', '')

    # Regexp#match accepts nil, so a missing field simply yields no match.
    citation = %r{\((\d+)\)\s(.+)}.match(fields.pop)
    year    = citation && citation[1]
    journal = citation && citation[2]
  end

  # Whatever is left alternates between the two parts of each author name.
  authors = []
  fields.each_slice(2) do |first_part, second_part|
    authors << "#{first_part}, #{second_part}" if second_part
  end

  Bio::Reference.new(
    'title'   => title,
    'pages'   => pages,
    'volume'  => volume,
    'year'    => year,
    'journal' => journal,
    'authors' => authors
  )
end
# File lib/bio/db/rebase.rb, line 295
#
# Copy the parsed reference data onto the matching entries of @data.
#
# For every name in @reference_data the entry @data[name] receives the
# :organism, :isoschizomers, :methylation and :source values, its suppliers
# (the :suppliers string split into single characters) and its references
# (each raw reference converted with raw_to_reference).
# Does nothing when no reference data is present.
def setup_enzyme_and_reference_association
  return unless @reference_data

  @reference_data.each do |enzyme_name, ref_entry|
    entry = @data[enzyme_name]

    entry[:organism]      = ref_entry[:organism]
    entry[:isoschizomers] = ref_entry[:isoschizomers]
    entry[:methylation]   = ref_entry[:methylation]
    entry[:source]        = ref_entry[:source]

    # One character per supplier code.
    entry.suppliers  = ref_entry[:suppliers].split('')
    entry.references = []
    ref_entry[:references].each do |raw|
      entry.references << raw_to_reference(raw)
    end
  end
end
# File lib/bio/db/rebase.rb, line 269
#
# Build @data (enzyme name => EnzymeEntry) from the parsed enzyme data,
# record the sorted enzyme names plus their downcased forms, and finally
# merge in the reference data via setup_enzyme_and_reference_association.
def setup_enzyme_data
  @data = {}

  @enzyme_data.each do |enzyme_name, fields|
    entry = EnzymeEntry.new
    @data[enzyme_name] = entry

    entry.pattern = fields[:pattern]
    # Assigned through []= because "entry.blunt?= ..." is a syntax error.
    entry[:blunt?] = (fields[:blunt].to_i == 1)
    entry.primary_strand_cut1       = fields[:c1].to_i
    entry.complementary_strand_cut1 = fields[:c2].to_i
    entry.primary_strand_cut2       = fields[:c3].to_i
    entry.complementary_strand_cut2 = fields[:c4].to_i

    # Defaults, in case no reference data is supplied.
    entry[:organism]      = ''
    entry[:isoschizomers] = ''
    entry[:methylation]   = ''
    entry[:source]        = ''
    entry.suppliers  = []
    entry.references = []
  end

  @enzyme_names = @data.keys.sort
  @enzyme_names_downcased = @enzyme_names.map { |enzyme_name| enzyme_name.downcase }
  setup_enzyme_and_reference_association
end