Class | Bio::Lasergene |
In: |
lib/bio/db/lasergene.rb
|
Parent: | Object |
bio/db/lasergene.rb - Interface for DNAStar Lasergene sequence file format
Author: | Trevor Wennblom <trevor@corevx.com> |
Copyright: | Copyright (c) 2007 Center for Biomedical Research Informatics, University of Minnesota (cbri.umn.edu) |
License: | The Ruby License |
Bio::Lasergene reads DNAStar Lasergene formatted sequence files, or +.seq+ files. It only expects to find one sequence per file.
require 'bio' filename = 'MyFile.seq' lseq = Bio::Lasergene.new( IO.readlines(filename) ) lseq.entry_id # => "Contig 1" lseq.seq # => ATGACGTATCCAAAGAGGCGTTACC
I'm only aware of the following three kinds of Lasergene file formats. Feel free to send me other examples that may not currently be accounted for.
File format 1:
## begin ## "Contig 1" (1,934) Contig Length: 934 bases Average Length/Sequence: 467 bases Total Sequence Length: 1869 bases Top Strand: 2 sequences Bottom Strand: 2 sequences Total: 4 sequences ^^ ATGACGTATCCAAAGAGGCGTTACCGGAGAAGAAGACACCGCCCCCGCAGTCCTCTTGGCCAGATCCTCCGCCGCCGCCCCTGGCTCGTCCACCCCCGCCACAGTTACCGCTGGAGAAGGAAAAATGGCATCTTCAWCACCCGCCTATCCCGCAYCTTCGGAWRTACTATCAAGCGAACCACAGTCAGAACGCCCTCCTGGGCGGTGGACATGATGAGATTCAATATTAATGACTTTCTTCCCCCAGGAGGGGGCTCAAACCCCCGCTCTGTGCCCTTTGAATACTACAGAATAAGAAAGGTTAAGGTTGAATTCTGGCCCTGCTCCCCGATCACCCAGGGTGACAGGGGAATGGGCTCCAGTGCTGWTATTCTAGMTGATRRCTTKGTAACAAAGRCCACAGCCCTCACCTATGACCCCTATGTAAACTTCTCCTCCCGCCATACCATAACCCAGCCCTTCTCCTACCRCTCCCGYTACTTTACCCCCAAACCTGTCCTWGATKCCACTATKGATKACTKCCAACCAAACAACAAAAGAAACCAGCTGTGGSTGAGACTACAWACTGCTGGAAATGTAGACCWCGTAGGCCTSGGCACTGCGTKCGAAAACAGTATATACGACCAGGAATACAATATCCGTGTMACCATGTATGTACAATTCAGAGAATTTAATCTTAAAGACCCCCCRCTTMACCCKTAATGAATAATAAMAACCATTACGAAGTGATAAAAWAGWCTCAGTAATTTATTYCATATGGAAATTCWSGGCATGGGGGGGAAAGGGTGACGAACKKGCCCCCTTCCTCCSTSGMYTKTTCYGTAGCATTCYTCCAMAAYACCWAGGCAGYAMTCCTCCSATCAAGAGcYTSYACAGCTGGGACAGCAGTTGAGGAGGACCATTCAAAGGGGGTCGGATTGCTGGTAATCAGA ## end ##
File format 2:
## begin ## ^^: 350,935 Contig 1 (1,935) Contig Length: 935 bases Average Length/Sequence: 580 bases Total Sequence Length: 2323 bases Top Strand: 2 sequences Bottom Strand: 2 sequences Total: 4 sequences ^^ ATGTCGGGGAAATGCTTGACCGCGGGCTACTGCTCATCATTGCTTTCTTTGTGGTATATCGTGCCGTTCTGTTTTGCTGTGCTCGTCAACGCCAGCGGCGACAGCAGCTCTCATTTTCAGTCGATTTATAACTTGACGTTATGTGAGCTGAATGGCACGAACTGGCTGGCAGACAACTTTAACTGGGCTGTGGAGACTTTTGTCATCTTCCCCGTGTTGACTCACATTGTTTCCTATGGTGCACTCACTACCAGTCATTTTCTTGACACAGTTGGTCTAGTTACTGTGTCTACCGCCGGGTTTTATCACGGGCGGTACGTCTTGAGTAGCATCTACGCGGTCTGTGCTCTGGCTGCGTTGATTTGCTTCGCCATCAGGTTTGCGAAGAACTGCATGTCCTGGCGCTACTCTTGCACTAGATACACCAACTTCCTCCTGGACACCAAGGGCAGACTCTATCGTTGGCGGTCGCCTGTCATCATAGAGAAAGGGGGTAAGGTTGAGGTCGAAGGTCATCTGATCGATCTCAAAAGAGTTGTGCTTGATGGCTCTGTGGCGACACCTTTAACCAGAGTTTCAGCGGAACAATGGGGTCGTCCCTAGACGACTTTTGCCATGATAGTACAGCCCCACAGAAGGTGCTCTTGGCGTTTTCCATCACCTACACGCCAGTGATGATATATGCCCTAAAGGTAAGCCGCGGCCGACTTTTGGGGCTTCTGCACCTTTTGATTTTTTTGAACTGTGCCTTTACTTTCGGGTACATGACATTCGTGCACTTTCGGAGCACGAACAAGGTCGCGCTCACTATGGGAGCAGTAGTCGCACTCCTTTGGGGGGTGTACTCAGCCATAGAAACCTGGAAATTCATCACCTCCAGATGCCGTTGTGCTTGCTAGGCCGCAAGTACATTCTGGCCCCTGCCCACCACGTTG ## end ##
File format 3 (non-standard Lasergene header):
## begin ## LOCUS PRU87392 15411 bp RNA linear VRL 17-NOV-2000 DEFINITION Porcine reproductive and respiratory syndrome virus strain VR-2332, complete genome. ACCESSION U87392 AF030244 U00153 VERSION U87392.3 GI:11192298 [...cut...] 3'UTR 15261..15411 polyA_site 15409 ORIGIN ^^ atgacgtataggtgttggctctatgccttggcatttgtattgtcaggagctgtgaccattggcacagcccaaaacttgctgcacagaaacacccttctgtgatagcctccttcaggggagcttagggtttgtccctagcaccttgcttccggagttgcactgctttacggtctctccacccctttaaccatgtctgggatacttgatcggtgcacgtgtacccccaatgccagggtgtttatggcggagggccaagtctactgcacacgatgcctcagtgcacggtctctccttcccctgaacctccaagtttctgagctcggggtgctaggcctattctacaggcccgaagagccactccggtggacgttgccacgtgcattccccactgttgagtgctcccccgccggggcctgctggctttctgcaatctttccaatcgcacgaatgaccagtggaaacctgaacttccaacaaagaatggtacgggtcgcagctgagctttacagagccggccagctcacccctgcagtcttgaaggctctacaagtttatgaacggggttgccgctggtaccccattgttggacctgtccctggagtggccgttttcgccaattccctacatgtgagtgataaacctttcccgggagcaactcacgtgttgaccaacctgccgctcccgcagagacccaagcctgaagacttttgcccctttgagtgtgctatggctactgtctatgacattggtcatgacgccgtcatgtatgtggccgaaaggaaagtctcctgggcccctcgtggcggggatgaagtgaaatttgaagctgtccccggggagttgaagttgattgcgaaccggctccgcacctccttcccgccccaccacacagtggacatgtctaagttcgccttcacagcccctgggtgtggtgtttctatgcgggtcgaacgccaacacggctgccttcccgctgacactgtccctgaaggcaactgctggtggagcttgtttgacttgcttccactggaagttcagaacaaagaaattcgccatgctaaccaatttggctaccagaccaagcatggtgtctctggcaagtacctacagcggaggctgca[...cut...] ## end ##
DELIMITER_1 | = | '^\^\^:' |
DELIMITER_2 | = | '^\^\^' |
average_length | [R] |
Average length per sequence
|
bottom_strand_sequences | [R] |
Number of bottom strand sequences
|
comments | [R] | Entire header before the sequence |
contig_length | [R] |
Contig length, length of present sequence
|
name | [R] |
Name of sequence
|
sequence | [R] |
Sequence
Bio::Sequence::NA or Bio::Sequence::AA object |
top_strand_sequences | [R] |
Number of top strand sequences
|
total_length | [R] |
Length of parent sequence
|
total_sequences | [R] |
Number of sequences
|
Bio::Sequence::NA or Bio::Sequence::AA object
# File lib/bio/db/lasergene.rb, line 141
#
# Returns the sequence stored in @sequence (documented above as a
# Bio::Sequence::NA or Bio::Sequence::AA object).
def seq
  @sequence
end
# File lib/bio/db/lasergene.rb, line 155
#
# Splits the raw content of a Lasergene file into its header comments and
# its sequence data, and stores the parsed values in instance variables.
#
# ---
# *Arguments*
# * +lines+: file content, either an Array of lines or one String
#   (a String has carriage returns removed and is split on newlines).
# *Sets*
# * <tt>@comments</tt>: header lines joined into one String. Anything up to
#   and including a DELIMITER_1 line is discarded.
# * <tt>@standard_comment</tt>: true when the first header line looks like
#   <tt>name (n,n)</tt>; in that case <tt>@name</tt>, <tt>@contig_length</tt>,
#   <tt>@average_length</tt>, <tt>@total_length</tt>,
#   <tt>@top_strand_sequences</tt>, <tt>@bottom_strand_sequences</tt> and
#   <tt>@total_sequences</tt> are filled from whichever header lines match.
# * <tt>@sequence</tt>: result of Bio::Sequence.auto on every line after the
#   DELIMITER_2 line.
# *Raises*:: InputError when there is more than one DELIMITER_1 line, more
#            than one DELIMITER_2 line, or no DELIMITER_2 line at all.
# *Returns*:: the value assigned to <tt>@sequence</tt>
def process(lines)
  # If the data from the file is passed as one big String instead of
  # broken into an Array, convert lines to an Array
  lines = lines.tr("\r", '').split("\n") if lines.kind_of? String

  delimiter_1_indices = []
  delimiter_2_indices = []

  # DELIMITER_1 must be tested first: every DELIMITER_1 line would also
  # match the shorter DELIMITER_2 pattern.
  lines.each_with_index do |line, index|
    if line.match DELIMITER_1
      delimiter_1_indices << index
    elsif line.match DELIMITER_2
      delimiter_2_indices << index
    end
  end

  raise InputError, "More than one delimiter of type '#{DELIMITER_1}'" if delimiter_1_indices.size > 1
  raise InputError, "More than one delimiter of type '#{DELIMITER_2}'" if delimiter_2_indices.size > 1
  raise InputError, "No comment to data separator of type '#{DELIMITER_2}'" if delimiter_2_indices.size < 1

  separator_index = delimiter_2_indices[0]

  # toss out DELIMITER_1 and anything preceding it
  header_start = delimiter_1_indices.empty? ? 0 : delimiter_1_indices[0] + 1

  # An exclusive range is required here. With an inclusive range ending at
  # (separator_index - 1), a separator on the very first line yields an end
  # index of -1, which Ruby reads as "last element" and would wrongly put
  # the whole file (separator and sequence included) into the comments.
  @comments = lines[header_start...separator_index] || []

  @standard_comment = false
  if @comments[0] =~ %r{(.+)\s+\(\d+,\d+\)} # if we have a standard Lasergene comment
    @standard_comment = true
    @name = $1
    @comments.each do |comment|
      # Only the first matching pattern is applied to each header line.
      if comment.match('Contig Length:\s+(\d+)')
        @contig_length = $1.to_i
      elsif comment.match('Average Length/Sequence:\s+(\d+)')
        @average_length = $1.to_i
      elsif comment.match('Total Sequence Length:\s+(\d+)')
        @total_length = $1.to_i
      elsif comment.match('Top Strand:\s+(\d+)')
        @top_strand_sequences = $1.to_i
      elsif comment.match('Bottom Strand:\s+(\d+)')
        @bottom_strand_sequences = $1.to_i
      elsif comment.match('Total:\s+(\d+)')
        @total_sequences = $1.to_i
      end
    end
  end

  @comments = @comments.join('')
  @sequence = Bio::Sequence.auto( lines[ (separator_index + 1) .. -1 ].join('') )
end