class MS::Sequest::Srf
Constants
- Dta
total_num_possible_charge_states is not correct under 3.5 (Bioworks 3.3.1); unknown is, well, unknown…
- Out
MS::Sequest::Srf::Out
= Struct.new( *%w(first_scan last_scan charge num_hits computer date_time hits total_inten lowest_sp num_matched_peptides db_locus_count).map(&:to_sym) )
Attributes
the base name of the file with no extension
the base name of the file with no extension
the base name of the file with no extension
a boolean to indicate if the results have been filtered by the sequest.params precursor mass tolerance
the directory the srf file was residing in when the filename was passed in. May not be available.
a String: 3.5, 3.3 or 3.2
Public Class Methods
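returns a two-element array: a Sequest::Params object (or nil if none was found)
and the IO position just after the params section (nil if none was found)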
# File lib/ms/sequest/srf.rb, line 53
# Finds the embedded '[SEQUEST]' params section of an srf file.
#
# Only the second half of the file is read and searched (the last occurrence
# of the marker wins). When the marker is found, the handle is positioned on
# it and handed to MS::Sequest::Params#parse_io.
#
# Returns [params, finish_pos]: the value returned by parse_io and the handle
# position right after parsing, or [nil, nil] when the marker is absent.
def self.get_sequest_params_and_finish_pos(filename)
  File.open(filename, 'rb') do |io|
    midpoint = io.stat.size / 2
    io.seek(midpoint)
    marker_offset = io.read.rindex('[SEQUEST]')
    next [nil, nil] if marker_offset.nil? # not found

    # marker_offset is relative to the second half; make it absolute
    io.seek(marker_offset + midpoint)
    parsed = MS::Sequest::Params.new.parse_io(io)
    [parsed, io.pos]
  end
end
opts:
:filter_by_precursor_mass_tolerance => true | false (default true) this will filter by the sequest params prec tolerance as is typically done by the Bioworks software. :read_pephits => true | false (default true) will attempt to read peptide hit information (equivalent to .out files), otherwise, just reads the dta information.
# File lib/ms/sequest/srf.rb, line 92
# Starts with empty peptide-hit, dta-file and out-file lists; when a filename
# is supplied, the object is populated right away through from_file using the
# same opts.
def initialize(filename=nil, opts={})
  @peptide_hits, @dta_files, @out_files = [], [], []
  from_file(filename, opts) if filename
end
Public Instance Methods
# File lib/ms/sequest/srf.rb, line 75
# Byte offset at which the dta entries begin, selected by @version.
# Returns nil for any version string other than '3.2', '3.3' or '3.5'.
def dta_start_byte
  case @version
  when '3.2' then 3260
  when '3.3', '3.5' then 3644
  end
end
-
updates the out_file’s list of hits based on passing peptide_hits (but not
the original hit id; rank is implicit in array ordering)
-
recalculates deltacn values completely if number of hits changed (does
not touch deltacn orig)
This can spoil proper protein -> peptide linkages. MS::Id::Search.merge! should be run after this method to ensure correct protein -> peptide linkages.
# File lib/ms/sequest/srf.rb, line 110
# Drops, from every out file, the hits that fall outside the precursor mass
# tolerance given in params, and marks this object as filtered.
#
# params.peptide_mass_units selects the comparison:
#   '0' - hit.deltamass.abs against the tolerance
#   '1' - hit.deltamass.abs against the tolerance / 1000
#   anything else (including '2') - hit.ppm.abs against the tolerance
#
# For each out file that actually lost hits, deltacn values are recomputed via
# MS::Sequest::Srf::Out::Peptide.update_deltacns_from_xcorr and num_hits is
# refreshed. Returns self.
def filter_by_precursor_mass_tolerance!
  tolerance = params.peptide_mass_tolerance.to_f
  units = params.peptide_mass_units

  compare_deltamass = ('0' == units) || ('1' == units)
  limit = ('1' == units) ? (tolerance/1000) : tolerance

  self.filtered_by_precursor_mass_tolerance = true

  self.out_files.each do |out|
    kept = out.hits
    count_before = kept.size
    kept.reject! do |hit|
      observed = compare_deltamass ? hit.deltamass : hit.ppm
      observed.abs > limit
    end
    next if kept.size == count_before

    out.hits = kept
    MS::Sequest::Srf::Out::Peptide.update_deltacns_from_xcorr(kept)
    out.num_hits = kept.size
  end
  self
end
Returns self (or nil when no sequest params section is found in the file). opts are the same as for ‘new’.
# File lib/ms/sequest/srf.rb, line 166
# Populates this object from the srf file at +filename+.
#
# opts (same as for 'new'):
#   :filter_by_precursor_mass_tolerance => true | false (default true)
#   :read_pephits => true | false (default true)
#
# Returns self, or nil when no sequest params section is found in the file.
# Raises NotImplementedError for a combined file when :read_pephits is false.
def from_file(filename, opts)
  @resident_dir = File.dirname(File.expand_path(filename))
  opts = { :filter_by_precursor_mass_tolerance => true, :read_pephits => true }.merge(opts)

  (@params, after_params_io_pos) = MS::Sequest::Srf.get_sequest_params_and_finish_pos(filename)
  return unless @params

  dup_refs_gt_0 = (@params.print_duplicate_references.to_i != 0)
  # When print_duplicate_references is 0 the following warning used to be
  # considered (it is intentionally disabled):
  #
  # warn %Q{
  #*****************************************************************************
  #WARNING: This srf file lists only 1 protein per peptide! (based on the
  #print_duplicate_references parameter in the sequest.params file used in its
  #creation)  So, downstream output will likewise only contain a single protein
  #for each peptide hit.  In many instances this is OK since downstream programs
  #will recalculate protein-to-peptide linkages from the database file anyway.
  #For complete protein lists per peptide hit, .srf files must be created with
  #print_duplicate_references > 0. HINT: to capture all duplicate references,
  #set the sequest parameter 'print_duplicate_references' to 100 or greater.
  #*****************************************************************************
  # }

  File.open(filename, 'rb') do |fh|
    @header = MS::Sequest::Srf::Header.from_io(fh)
    @version = @header.version

    # nil for any version other than the three known ones
    unpack_35 = case @version
                when '3.2', '3.3' then false
                when '3.5' then true
                end

    if @header.combined
      @base_name = File.basename(filename, '.*')
      # I'm not sure why this is the case, but the reported number is too
      # big by one on the 2 files I've seen so far, so we will correct it here!
      @header.dta_gen.num_dta_files = @header.dta_gen.num_dta_files - 1
      if opts[:read_pephits] == false
        raise NotImplementedError, "on combined files must read everything right now!"
      end
      (@dta_files, @out_files) = read_dta_and_out_interleaved(fh, @header.num_dta_files, unpack_35, dup_refs_gt_0)
    else
      @base_name = @header.raw_filename.scan(/[\\\/]([^\\\/]+)\.RAW$/).first.first
      @dta_files = read_dta_files(fh, @header.num_dta_files, unpack_35)
      if opts[:read_pephits]
        # need the params file to know if the duplicate_references is set > 0
        # (@params cannot be nil here because of the early return above)
        raise NoSequestParamsError, "no sequest params info in srf file!\npass in path to sequest.params file" if @params.nil?
        @out_files = read_out_files(fh,@header.num_dta_files, unpack_35, dup_refs_gt_0)
        # FOR DISPLAY ONLY!
        #@out_files.each do |f|
        #  if f.num_hits == 10
        #    p f.hits.last
        #  end
        #end
        if fh.eof?
          #warn "FILE: '#{filename}' appears to be an abortive run (no params in srf file)\nstill continuing..."
          @params = nil
          @index = []
        end
      end
    end

    fh.pos = after_params_io_pos
    # This is very sensitive to the grab_params method in sequest params
    fh.read(12)  ## gap between last params entry and index
    @index = read_scan_index(fh,@header.num_dta_files)
  end

  ### UPDATE SOME THINGS:
  # give each hit a base_name, first_scan, last_scan
  if opts[:read_pephits] && !@header.combined
    @index.each_with_index do |ind,i|
      mass_measured = @dta_files[i][0]
      outfile = @out_files[i]
      outfile.first_scan = ind[0]
      outfile.last_scan = ind[1]
      outfile.charge = ind[2]

      pep_hits = outfile.hits
      @peptide_hits.push( *pep_hits )
      pep_hits.each do |pep_hit|
        pep_hit[15] = @base_name
        pep_hit[16] = ind[0]
        pep_hit[17] = ind[1]
        pep_hit[18] = ind[2]
        # add the deltamass
        pep_hit[12] = pep_hit[0] - mass_measured  # real - measured (deltamass)
        pep_hit[13] = 1.0e6 * pep_hit[12].abs / mass_measured  ## ppm
        pep_hit[19] = self  ## link with the srf object
      end
    end

    # Honour the documented option: it was previously merged into opts but
    # never consulted, so filtering always ran.
    filter_by_precursor_mass_tolerance! if opts[:filter_by_precursor_mass_tolerance] && params
  end

  self
end
# File lib/ms/sequest/srf.rb, line 48
# The class used for protein entries: MS::Sequest::Srf::Out::Protein.
def protein_class
  MS::Sequest::Srf::Out::Protein
end
# File lib/ms/sequest/srf.rb, line 149
# Reads num_files dta/out pairs that are stored alternately (one dta entry,
# then its out entry), starting at dta_start_byte.
#
# Returns [dta_files, out_files], two arrays of length num_files.
def read_dta_and_out_interleaved(fh, num_files, unpack_35, dup_refs_gt_0)
  dtas = Array.new(num_files)
  outs = Array.new(num_files)
  fh.pos = dta_start_byte
  dtas.each_index do |idx|
    dtas[idx] = MS::Sequest::Srf::Dta.from_io(fh, unpack_35)
    outs[idx] = MS::Sequest::Srf::Out.from_io(fh, unpack_35, dup_refs_gt_0)
  end
  [dtas, outs]
end
returns an array of dta_files
# File lib/ms/sequest/srf.rb, line 301
# Reads num_files consecutive dta entries starting at dta_start_byte and
# leaves the handle positioned right after the last one.
#
# The entry count comes from the num_files argument; previously the array was
# sized from it but the loop ran header.num_dta_files times, so the argument
# was effectively ignored.
#
# Returns an array of the objects produced by MS::Sequest::Srf::Dta.from_io.
def read_dta_files(fh, num_files, unpack_35)
  fh.pos = dta_start_byte
  Array.new(num_files) { MS::Sequest::Srf::Dta.from_io(fh, unpack_35) }
end
filehandle (fh) must be at the start of the outfiles. ‘read_dta_files’ will put the fh there.
# File lib/ms/sequest/srf.rb, line 314
# Reads number_files consecutive out entries from the current handle
# position (the handle is not repositioned here).
#
# The entry count comes from the number_files argument; previously the array
# was sized from it but the loop ran header.num_dta_files times, so the
# argument was effectively ignored.
#
# Returns an array of the objects produced by MS::Sequest::Srf::Out.from_io.
def read_out_files(fh,number_files, unpack_35, dup_refs_gt_0)
  Array.new(number_files) do
    MS::Sequest::Srf::Out.from_io(fh, unpack_35, dup_refs_gt_0)
  end
end
returns an index where each entry is [first_scan, last_scan, charge]
# File lib/ms/sequest/srf.rb, line 280
# Reads num index records from fh, starting at its current position.
#
# Each record occupies 24 bytes; only the leading three native unsigned ints
# ('III') are decoded, giving [first_scan, last_scan, charge]. The remaining
# bytes of the record are read and discarded.
#
# Returns an array of num three-element arrays.
def read_scan_index(fh, num)
  record_bytes = 24
  # One reusable receive buffer; IO#read resizes it as needed, so it does not
  # have to be pre-filled.
  buffer = String.new
  Array.new(num) do
    fh.read(record_bytes, buffer)
    buffer.unpack('III')
  end
end