Class | Bio::SOFT |
In: |
lib/bio/db/soft.rb
|
Parent: | Object |
bio/db/soft.rb - Interface for SOFT formatted files
Author: | Trevor Wennblom <trevor@corevx.com> |
Copyright: | Copyright (c) 2007 Midwinter Laboratories, LLC (midwinterlabs.com) |
License: | The Ruby License |
"SOFT (Simple Omnibus in Text Format) is a compact, simple, line-based, ASCII text format that incorporates experimental data and metadata." — GEO, National Center for Biotechnology Information
The Bio::SOFT class reads SOFT Series or Platform formatted files that contain information describing one database, one series, one platform, and many samples (GEO accessions). The data from the file can then be viewed with Ruby methods.
Bio::SOFT also supports the reading of SOFT DataSet files which contain one database, one dataset, and many subsets.
Format specification is located here:
SOFT data files may be directly downloaded here:
NCBI's Gene Expression Omnibus (GEO) is here:
If an attribute has more than one value then the values are stored in an Array of String objects. Otherwise the attribute is stored as a String.
The platform and each sample may contain a table of data. A dataset from a DataSet file may also contain a table.
Attributes are dynamically created based on the data in the file. Predefined keys have not been created in advance due to the variability of SOFT files in-the-wild.
Keys are generally stored as Symbols. Keys for samples and table headings may alternatively be accessed with Strings. The names of samples (GEO accessions) are case sensitive. Table headers are case insensitive.
require 'bio' lines = IO.readlines('GSE3457_family.soft') soft = Bio::SOFT.new(lines) soft.platform[:geo_accession] # => "GPL2092" soft.platform[:organism] # => "Populus" soft.platform[:contributor] # => ["Jingyi,,Li", "Olga,,Shevchenko", "Steve,H,Strauss", "Amy,M,Brunner"] soft.platform[:data_row_count] # => "240" soft.platform.keys.sort {|a,b| a.to_s <=> b.to_s}[0..2] # => [:contact_address, :contact_city, :contact_country] soft.platform[:"contact_zip/postal_code"] # => "97331" soft.platform[:table].header # => ["ID", "GB_ACC", "SPOT_ID", "Function/Family", "ORGANISM", "SEQUENCE"] soft.platform[:table].header_description # => {"ORGANISM"=>"sequence sources", "SEQUENCE"=>"oligo sequence used", "Function/Family"=>"gene functions and family", "ID"=>"", "SPOT_ID"=>"", "GB_ACC"=>"Gene bank accession number"} soft.platform[:table].rows.size # => 240 soft.platform[:table].rows[5] # => ["A039P68U", "AI163321", "", "TF, flowering protein CONSTANS", "P. tremula x P. tremuloides", "AGAAAATTCGATATACTGTCCGTAAAGAGGTAGCACTTAGAATGCAACGGAATAAAGGGCAGTTCACCTC"] soft.platform[:table].rows[5][4] # => "P. tremula x P. tremuloides" soft.platform[:table].rows[5][:organism] # => "P. tremula x P. tremuloides" soft.platform[:table].rows[5]['ORGANISM'] # => "P. tremula x P. tremuloides" soft.series[:geo_accession] # => "GSE3457" soft.series[:contributor] # => ["Jingyi,,Li", "Olga,,Shevchenko", "Ove,,Nilsson", "Steve,H,Strauss", "Amy,M,Brunner"] soft.series[:platform_id] # => "GPL2092" soft.series[:sample_id].size # => 74 soft.series[:sample_id][0..4] # => ["GSM77557", "GSM77558", "GSM77559", "GSM77560", "GSM77561"] soft.database[:name] # => "Gene Expression Omnibus (GEO)" soft.database[:ref] # => "Nucleic Acids Res. 
2005 Jan 1;33 Database Issue:D562-6" soft.database[:institute] # => "NCBI NLM NIH" soft.samples.size # => 74 soft.samples[:GSM77600][:series_id] # => "GSE3457" soft.samples['GSM77600'][:series_id] # => "GSE3457" soft.samples[:GSM77600][:platform_id] # => "GPL2092" soft.samples[:GSM77600][:type] # => "RNA" soft.samples[:GSM77600][:title] # => "jst2b2" soft.samples[:GSM77600][:table].header # => ["ID_REF", "VALUE"] soft.samples[:GSM77600][:table].header_description # => {"ID_REF"=>"", "VALUE"=>"normalized signal intensities"} soft.samples[:GSM77600][:table].rows.size # => 217 soft.samples[:GSM77600][:table].rows[5] # => ["A039P68U", "8.19"] soft.samples[:GSM77600][:table].rows[5][0] # => "A039P68U" soft.samples[:GSM77600][:table].rows[5][:id_ref] # => "A039P68U" soft.samples[:GSM77600][:table].rows[5]['ID_REF'] # => "A039P68U" lines = IO.readlines('GDS100.soft') soft = Bio::SOFT.new(lines) soft.database[:name] # => "Gene Expression Omnibus (GEO)" soft.database[:ref] # => "Nucleic Acids Res. 
2005 Jan 1;33 Database Issue:D562-6" soft.database[:institute] # => "NCBI NLM NIH" soft.subsets.size # => 8 soft.subsets.keys # => ["GDS100_1", "GDS100_2", "GDS100_3", "GDS100_4", "GDS100_5", "GDS100_6", "GDS100_7", "GDS100_8"] soft.subsets[:GDS100_7] # => {:dataset_id=>"GDS100", :type=>"time", :sample_id=>"GSM548,GSM543", :description=>"60 minute"} soft.subsets['GDS100_7'][:sample_id] # => "GSM548,GSM543" soft.subsets[:GDS100_7][:sample_id] # => "GSM548,GSM543" soft.subsets[:GDS100_7][:dataset_id] # => "GDS100" soft.dataset[:order] # => "none" soft.dataset[:sample_organism] # => "Escherichia coli" soft.dataset[:table].header # => ["ID_REF", "IDENTIFIER", "GSM549", "GSM542", "GSM543", "GSM547", "GSM544", "GSM545", "GSM546", "GSM548"] soft.dataset[:table].rows.size # => 5764 soft.dataset[:table].rows[5] # => ["6", "EMPTY", "0.097", "0.217", "0.242", "0.067", "0.104", "0.162", "0.104", "0.154"] soft.dataset[:table].rows[5][4] # => "0.242" soft.dataset[:table].rows[5][:gsm549] # => "0.097" soft.dataset[:table].rows[5][:GSM549] # => "0.097" soft.dataset[:table].rows[5]['GSM549'] # => "0.097"
LINE_TYPE_ENTITY_INDICATOR | = | '^' | ||
LINE_TYPE_ENTITY_ATTRIBUTE | = | '!' | ||
LINE_TYPE_TABLE_HEADER | = | '#' | ||
TABLE_COLUMN_DELIMITER | = | "\t" | data table row defined by absence of line type character |
database | [RW] | |
dataset | [RW] | |
platform | [RW] | |
samples | [RW] | |
series | [RW] | |
subsets | [RW] |
Constructor
Arguments
Returns: | Bio::SOFT |
# File lib/bio/db/soft.rb, line 147
#
# Creates the empty containers for every SOFT entity kind and, when input
# is supplied, parses it.
#
# *Arguments*
# * +lines+: (_optional_) collection of SOFT-formatted lines, for example
#   the Array returned by IO.readlines. It is handed to #process, which
#   walks it with +each_with_index+. When omitted (+nil+) nothing is parsed
#   and every container stays empty.
# *Returns*:: Bio::SOFT
def initialize(lines=nil)
  @database = Database.new

  @series   = Series.new
  @platform = Platform.new
  @samples  = Samples.new

  @dataset = Dataset.new
  @subsets = Subsets.new

  # The default argument is nil, and nil does not respond to
  # each_with_index, so only parse when input was actually given.
  process(lines) unless lines.nil?
end
# File lib/bio/db/soft.rb, line 381
#
# Raises a RuntimeError whose message is the 1-based input line number
# followed by +msg+, separated by a tab.
#
# *Arguments*
# * +line_number_with_0_based_indexing+: index of the offending line as
#   produced by +each_with_index+ (first line is 0)
# * +msg+: explanation appended after the line number
# *Raises*:: RuntimeError (always)
def custom_raise( line_number_with_0_based_indexing, msg )
  # Report the line number the way a text editor would show it (1-based).
  location = "Error processing input line: #{line_number_with_0_based_indexing+1}"
  raise RuntimeError, [location, msg].join("\t")
end
# File lib/bio/db/soft.rb, line 354
#
# Builds the human-readable text for one of the parser's error codes.
#
# *Arguments*
# * +i+: error code
#   * 10 - data row found outside of a table
#   * 20 - table information found inside an entity that may not hold one
#   * 30 - attribute line found before any entity indicator line
#   * 40 - unrecognised entity indicator
# * +extra_info+: (_optional_) detail interpolated into the message.
#   Used by code 20 (the current entity) and code 40 (the offending line);
#   ignored by the other codes.
# *Returns*:: String
# *Raises*:: IndexError when +i+ is not one of the codes above
def error_msg( i, extra_info=nil )
  case i
  when 10
    x = ["Lines without line-type characters are rows in a table, but",
         "a line containing an entity indicator such as",
         "\"#{LINE_TYPE_ENTITY_INDICATOR}SAMPLE\",",
         "\"#{LINE_TYPE_ENTITY_INDICATOR}PLATFORM\",",
         "or \"#{LINE_TYPE_ENTITY_INDICATOR}DATASET\" has not been",
         "previously encountered or it does not appear that this line is",
         "in a table."]
  when 20
    # Tables are allowed inside samples, platforms, and datasets; this
    # matches the entity check performed for table header lines in #process.
    x = ["Tables are only allowed inside SAMPLE, PLATFORM, and DATASET.",
         "Current table information found inside #{extra_info}."]
  when 30
    x = ["Entity attribute line (\"#{LINE_TYPE_ENTITY_ATTRIBUTE}\")",
         "found before entity indicator line (\"#{LINE_TYPE_ENTITY_INDICATOR}\")"]
  when 40
    x = ["Unknown entity indicator. Must be DATABASE, SAMPLE, PLATFORM,",
         "SERIES, DATASET, or SUBSET."]
    # The caller passes the offending line; show it so the user can find it.
    x << "Offending line: #{extra_info.inspect}." unless extra_info.nil?
  else
    raise IndexError, "Unknown error message requested."
  end

  x.join(" ")
end
# File lib/bio/db/soft.rb, line 272
#
# Parses SOFT-formatted lines and stores what it finds in the database,
# dataset, platform, series, samples, and subsets containers.
#
# Each non-blank line is classified by its first character:
# * LINE_TYPE_ENTITY_INDICATOR - selects the entity that following lines
#   belong to (a new Sample or Subset is created per indicator line)
# * LINE_TYPE_ENTITY_ATTRIBUTE - "key = value" stored on the current entity;
#   a repeated key collects its values in an Array
# * LINE_TYPE_TABLE_HEADER - column description stored on the current
#   entity's table
# * anything else - a table row handed to the table's add_header_or_row
#
# *Arguments*
# * +lines+: collection responding to +each_with_index+ whose elements are
#   the lines of a SOFT file. The elements are not modified.
# *Raises*:: RuntimeError (through #custom_raise) when a line appears in a
#            context where it is not allowed
def process(lines)
  current_indicator = nil
  current_class_accessor = nil
  in_table = false

  lines.each_with_index do |raw_line, line_number|
    # Work on a stripped copy: the caller's strings stay untouched and
    # frozen strings are accepted. to_s lets nil elements be skipped as
    # blank lines.
    line = raw_line.to_s.strip
    next if line.empty?

    case line[0].chr
    when LINE_TYPE_ENTITY_INDICATOR
      current_indicator, value = split_label_value_in( line[1..-1] )

      # A new entity never starts inside the previous entity's table.
      in_table = false

      case current_indicator
      when 'DATABASE'
        current_class_accessor = @database
      when 'DATASET'
        current_class_accessor = @dataset
      when 'PLATFORM'
        current_class_accessor = @platform
      when 'SERIES'
        current_class_accessor = @series
      when 'SAMPLE'
        @samples[value] = Sample.new
        current_class_accessor = @samples[value]
      when 'SUBSET'
        @subsets[value] = Subset.new
        current_class_accessor = @subsets[value]
      else
        custom_raise( line_number, error_msg(40, line) )
      end

    when LINE_TYPE_ENTITY_ATTRIBUTE
      custom_raise( line_number, error_msg(30) ) if current_indicator.nil?

      # Handle lines such as '!platform_table_begin' and '!platform_table_end'
      if in_table
        if line =~ %r{table_begin}
          next
        elsif line =~ %r{table_end}
          in_table = false
          next
        end
      end

      key, value = split_label_value_in( line, true )
      key_s = key.to_sym

      if current_class_accessor.include?( key_s )
        # Second occurrence of a key: promote the stored String to an Array.
        unless current_class_accessor[ key_s ].class == Array
          current_class_accessor[ key_s ] = [ current_class_accessor[ key_s ] ]
        end
        current_class_accessor[ key_s ] << value
      else
        current_class_accessor[ key_s ] = value
      end

    when LINE_TYPE_TABLE_HEADER
      unless ['SAMPLE', 'PLATFORM', 'DATASET'].include?( current_indicator )
        custom_raise( line_number, error_msg(20, current_indicator.inspect) )
      end

      in_table = true # may be redundant, computationally not worth checking

      # We only expect one table per platform, sample, or dataset
      current_class_accessor[:table] ||= Table.new
      key, value = split_label_value_in( line )
      # key[1..-1] -- Remove first character which is the LINE_TYPE_TABLE_HEADER
      current_class_accessor[:table].header_description[ key[1..-1] ] = value

    else
      # Type: No line type - should be a row in a table.
      if current_indicator.nil? || !in_table
        custom_raise( line_number, error_msg(10) )
      end
      current_class_accessor[:table].add_header_or_row( line )
    end
  end
end
# File lib/bio/db/soft.rb, line 386
#
# Splits a "label = value" line at the first "=" (surrounding whitespace
# is discarded).
#
# *Arguments*
# * +line+: String to split
# * +shift_key+: (_optional_) when true, everything up to and including the
#   first underscore is removed from the label, e.g.
#   "!Platform_geo_accession" becomes "geo_accession"
# *Returns*:: two-element Array <tt>[key, value]</tt>
# *Raises*:: RuntimeError when +line+ contains no "=", or when +shift_key+
#            is true and the label contains no underscore
def split_label_value_in( line, shift_key=false )
  separator = %r{\s*=\s*}.match( line )
  unless separator
    raise "Expected \"label = value\" but found: #{line.inspect}"
  end

  key   = separator.pre_match
  value = separator.post_match

  if shift_key
    underscore = %r{_}.match( key )
    unless underscore
      raise "Expected a prefix ending in \"_\" before the label in: #{line.inspect}"
    end
    key = underscore.post_match
  end

  [key, value]
end