module BioPangenome

Constants

GeneFlankingRegion
Transcript

Public Class Methods

align_gene_groups( seqs:{}, tmp_folder:"/Volumes/PanGenome/GeneRegions/201910_v2_v3/tmp", output:"../pairwise_blast_oct_2019/varieties_6A_identites", distance: 0 ) click to toggle source
# File lib/bio-pangenome/pangenome.rb, line 103
def self.align_gene_groups( seqs:{}, tmp_folder:"/Volumes/PanGenome/GeneRegions/201910_v2_v3/tmp", output:"../pairwise_blast_oct_2019/varieties_6A_identites", distance: 0 )
        out_tmp="#{tmp_folder}/out.blast"
        FileUtils.mkdir_p(tmp_folder)
        out = File.open("#{output}_#{distance}bp.tab", "w")
        out.puts [ "transcript" , "query", "subject" ,  "var_query", "var_subject", "aln_type", "length" , "pident" , "Ns_query", "Ns_subject", "Ns_total", "Flanking"   ].join("\t")
        seqs.each_pair do |transcript, transcript_seqs|
                vars = transcript_seqs.keys
                vars_done = []
                ns = {}
                vars.each do |v1|
                        tmp =  tmp_folder  + "/" + v1 + ".fa"
                        s = transcript_seqs[v1]
                        seq = ">#{s.id}\n#{s.sequence}"
                        File.open(tmp, 'w') {|f| f.write(seq) }
                        ns[v1] = s.sequence.count('Nn')
                end
                vars.each do |v1|
                        tmp1 =  tmp_folder  + "/" + v1 + ".fa"
                        s1 = transcript_seqs[v1]
                        next unless s1.sequence.length > 0
                        vars.each do |v2|
                                next if v1 == v2
                                next if vars_done.include? v2
                                s2 = transcript_seqs[v2]
                                next unless s2.sequence.length > 0
                                tmp2 =  tmp_folder  + "/" + v2 + ".fa"
                                to_print = [transcript, s1.id , s2.id , v1,v2,"#{v1}->#{v2}"]
                                to_print << blast_pair_fast(tmp1, tmp2, out_tmp) 
                                to_print << ns[v1] 
                                to_print << ns[v2]
                                to_print << ns[v1] + ns[v2]
                                to_print << distance
                                out.puts to_print.join("\t")
                        end
                        vars_done << v1
                end
        end
        out.close
end
blast_pair_fast(path_a, path_b, out_path, program: "blastn") click to toggle source
# File lib/bio-pangenome/pangenome.rb, line 58
def self.blast_pair_fast(path_a, path_b, out_path, program: "blastn")
        cmd = "#{program} -query #{path_a} -subject #{path_b} -task #{program} -out #{out_path} -outfmt '5' "
        system cmd
        n = Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum
        max_length = 0
        max_pident = 0.0
        n.each do | iter |
                iter.each do | hit |
                        hit.each do | hsp |
                                if hsp.align_len > max_length
                                        max_length = hsp.align_len
                                        max_pident = 100 * hsp.identity.to_f / hsp.align_len.to_f
                                end
                        end
                end
        end
        [max_length, max_pident]
end
load_cds_sequences( varieties:[], genes:{}, prefix: "../flanking/filtered/", suffix: ".cds.fa.gz", set_id: "cds" ) click to toggle source
# File lib/bio-pangenome/pangenome.rb, line 143
def self.load_cds_sequences( varieties:[], genes:{}, prefix: "../flanking/filtered/",  suffix: ".cds.fa.gz", set_id: "cds" )
        ret = Hash.new { |h, k| h[k] = Hash.new }
        varieties.each do |variety|
                path = "#{prefix}/#{variety}#{suffix}"
                infile = open(path)
                io = Zlib::GzipReader.new(infile) 
                Bio::FlatFile.open(Bio::FastaFormat, io) do |fasta_file|
                        fasta_file.each do |entry|
                                arr = entry.definition.split(".")
                                next unless genes[arr[0]]
                                row = genes[arr[0]]
                                seq_name = GeneFlankingRegion.new(entry.definition,
                                        row["gene"], "",
                                        "", entry.definition, set_id, nil, variety )
                                seq = entry.seq
                                seq.gsub!(/N*$/, '')
                                seq.gsub!(/^N*/, '')
                                seq_name.sequence = seq
                                base_gene = seq_name.gene
                                ret[base_gene][variety] = seq_name unless ret[base_gene][variety]
                        end
                end
                io.close
        end
        ret
end
load_genes(filename, window: 0, no_windows: 0) click to toggle source
# File lib/bio-pangenome/pangenome.rb, line 182
def self.load_genes(filename, window: 0, no_windows: 0)
        genes = File.readlines(filename).map do |t|
                t.chomp!.split(".")[0]
        end
        if no_windows > 0
                puts "'loading window #{window} of #{no_windows}'"
                window_size = genes.size/no_windows
                start = window * window_size
                genes = genes[start, window_size]
        end
        genes
end
load_lines(filename) click to toggle source
# File lib/bio-pangenome/pangenome.rb, line 195
def self.load_lines(filename)
        File.readlines(filename).map do |t|
                t.chomp!.rstrip
        end
end
load_mapping_hash(varieties:[], transcripts:[], genes:[], distance: 1000, prefix: "../flanking/releasePGSBV1/", suffix: ".RefSeqv1.1") click to toggle source
# File lib/bio-pangenome/pangenome.rb, line 38
def self.load_mapping_hash(varieties:[],  transcripts:[], genes:[], distance: 1000, prefix: "../flanking/releasePGSBV1/",  suffix: ".RefSeqv1.1")
        ret = Hash.new { |h, k| h[k] = Hash.new }
        varieties.each do |v|
                path = "#{prefix}#{distance}bp/#{v}_#{distance}bp_#{suffix}.reg.map"
                $stderr.puts path
                File.foreach(path) do |line|
                        line.chomp!
                        arr = line.split("\t")
                        begin
                                parsed = parseSequenceName(arr[0], arr[1])
                        rescue Exception => e
                                throw "Unable to parse #{line} (#{v}) [#{e.to_s}]" 
                        end
                        next unless transcripts.include? parsed.transcript or genes.include? parsed.gene
                        ret[v][parsed.region] = parsed
                end
        end
        ret
end
load_projected_genes(transcript_mapping, genes:[]) click to toggle source
# File lib/bio-pangenome/pangenome.rb, line 170
def self.load_projected_genes(transcript_mapping, genes:[])
        projected_genes = {}
        Zlib::GzipReader.open(transcript_mapping) do |gzip|
                csv = CSV.new(gzip, headers: true)
                csv.each do |row|
                        next unless genes.include? row["gene"]
                        projected_genes[row["projected_gene"]] = row
                end
        end
        projected_genes
end
load_sequences_from_hash(coordinates:{}, prefix: "../flanking/filtered/", suffix: "RefSeqv1.1", distance: 1000, projected_genes: {}) click to toggle source
# File lib/bio-pangenome/pangenome.rb, line 78
def self.load_sequences_from_hash(coordinates:{},  prefix: "../flanking/filtered/",  suffix: "RefSeqv1.1", distance: 1000, projected_genes: {})
        ret = Hash.new { |h, k| h[k] = Hash.new }
        coordinates.each_pair do |variety, coords|

                path = "#{prefix}/#{distance}bp/#{variety}_#{distance}bp_#{suffix}.fa.gz"
                puts "Loading: #{path}"
                infile = open(path)
                io = Zlib::GzipReader.new(infile) 
                Bio::FlatFile.open(Bio::FastaFormat, io) do |fasta_file|
                        fasta_file.each do |entry|
                                next unless coords[entry.definition]
                                seq_name = coords[entry.definition]
                                seq = entry.seq
                                seq.gsub!(/N*$/, '')
                                seq.gsub!(/^N*/, '')
                                seq_name.sequence = seq
                                base_gene = projected_genes[seq_name.gene]["gene"]
                                ret[base_gene][variety] = seq_name unless ret[base_gene][variety]
                        end
                end
                io.close
        end
        ret
end
parseEITranscript(name) click to toggle source
# File lib/bio-pangenome/pangenome.rb, line 18
def self.parseEITranscript name
        arr=name.split(".")
        match = /Traes(?<chr>[[:upper:]]{3}_scaffold_[[:digit:]]*)_(?<ver>[[:digit:]]{2})G(?<count>[[:digit:]]+)(?<conf>[[:upper:]]*)/.match arr[0]
        raise "Unable to parse: #{name}" unless match
        Transcript.new(name, arr[0],match[:chr].downcase,match[:ver],match[:count],arr[1],match[:conf], match[:count].to_i, arr[1])
end
parsePGSBTranscript(name) click to toggle source
# File lib/bio-pangenome/pangenome.rb, line 25
def self.parsePGSBTranscript name
        arr=name.split(".")
        match = /Traes(?<variety>[[:upper:]]{3})(?<chr>[[:alnum:]]{1,2})(?<ver>[[:digit:]]{2})G(?<count>[[:digit:]]+)(?<conf>[[:upper:]]*)/.match arr[0]

        raise "Unable to parse: #{name}" unless match
        Transcript.new(name, arr[0],match[:chr],match[:ver],match[:count],match[:variety], match[:conf],match[:count].to_i, arr[1])
end
parseSequenceName(region, name) click to toggle source
# File lib/bio-pangenome/pangenome.rb, line 32
def self.parseSequenceName region, name
        match = /(?<transcript>[[:alnum:]].+)_(?<ann>.+)_(?<flank_length>[[:digit:]]+bp)/.match name
        arr2=match[:transcript].split "."
        GeneFlankingRegion.new(match[:transcript],arr2[0],match[:ann], region, name, match[:flank_length] , nil, nil)
end
parseTranscript(name) click to toggle source
# File lib/bio-pangenome/pangenome.rb, line 12
def self.parseTranscript name
        arr=name.split(".")
        match = /TraesCS(?<chr>[[:alnum:]]{1,2})(?<ver>[[:digit:]]{2})G(?<count>[[:digit:]]+)(?<conf>[[:upper:]]*)/.match arr[0]
        raise "Unable to parse: #{name}" unless match
        Transcript.new(name, arr[0],match[:chr],match[:ver],match[:count],arr[1],match[:conf], match[:count].to_i, arr[1])
end