class Bio::Spoctopus::Result

Public Class Methods

create_from_output(spoctopus_output) click to toggle source

Given the fasta-ish file output from spoctopus, parse it into a SignalPeptideTransmembraneDomainProtein.

Example without TMD: >wrapperSeq gggggggggggggggggggggggggggggggggggggggggggggggggggggggggggg ggggggggggggggggggggggggggggggggggggg

Example with 2 TMD >wrapperSeq iiiiiiiiiiMMMMMMMMMMMMMMMMMMMMMooooooooooooooooooooooooooooo ooooMMMMMMMMMMMMMMMMMMMMMiiiiiMMMMMMMMMMMMMMMMMMMMMo

Example with SP and TMD >wrapperSeq nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnSSSSSSSSSSSSSSSooooooooooooooo ooooooooooooooooooooooooooooooooooooo

# File lib/bio/appl/octopus.rb, line 114
def self.create_from_output(spoctopus_output)
  #        puts spoctopus_output
  # split the fasta into the real parts
  lines = spoctopus_output.split("\n")
  
  # Error checking
  unless lines[0].match(/^\>/) and lines.length > 1
    raise Exception, "Unexpected OCTOPUS output file: #{spoctopus_output.inspect}. STDERR: #{File.open(err.path).read}"
  end

  seq = lines[1..(lines.length-1)].join('')

  # Taken from http://octopus.cbr.su.se/OCTOPUS_DATA/readme
  # and supplemented by experiment, as there doesn't seem to be one available for
  # SPOCTOPUS, only OCTOPUS.
  #
  # Currently dips, hairpins, unannotated and reentrants are ignored.
  unless seq.match(/^[ioMgnSHRrDd\.T]+$/)
    raise Exception, "Unexpected characters in SPOCTOPUS output sequence: #{seq}"
  end

  tmd = Bio::Transmembrane::SignalPeptideTransmembraneDomainProtein.new

  # deal with nothing proteins
  return tmd if seq.match(/^g*$/)

  seq.scan(/S+/) do
    if tmd.signal?
      raise Exception, "Only 1 Signal Peptide is expected!. SPOCTOPUS output was #{seq}"
    end

    s = Bio::Transmembrane::SignalPeptide.new
    s.start = $~.offset(0)[0]+1
    s.stop = $~.offset(0)[1]
    tmd.signal_peptide = s
  end

  seq.scan(/M+/) do # for each transmembrane domain
    t = Bio::Transmembrane::OrientedTransmembraneDomain.new
    t.start = $~.offset(0)[0]+1
    t.stop = $~.offset(0)[1]

    # set orientation
    # if at the start of the protein it is harder
    if t.start == 1
      if t.stop == seq.length #all TMD, so we don't know
        t.orientation = Bio::Transmembrane::OrientedTransmembraneDomain::UNKNOWN
      else
        char = seq[t.stop-2..t.stop-2]
        if char == 'o'
          t.orientation = Bio::Transmembrane::OrientedTransmembraneDomain::INSIDE_OUT
        else
          t.orientation = Bio::Transmembrane::OrientedTransmembraneDomain::OUTSIDE_IN
        end
      end

    else # usual - TMD does not start at exactly the beginning
      char = seq[t.start-2..t.start-2]
      if char == 'i'
        t.orientation = Bio::Transmembrane::OrientedTransmembraneDomain::INSIDE_OUT
      else
        t.orientation = Bio::Transmembrane::OrientedTransmembraneDomain::OUTSIDE_IN
      end
    end

    tmd.transmembrane_domains.push t
  end

  return tmd
end