module TorchText::Data::Utils
Constants
- PATTERNS_DICT
Public Instance Methods
ngrams_iterator(token_list, ngrams) { |x| ... }
click to toggle source
# File lib/torchtext/data/utils.rb, line 17
# Yields every token of +token_list+, followed by every n-gram for
# n = 2 up to +ngrams+. Each n-gram is yielded as a single string whose
# tokens are joined by one space.
#
# @param token_list [Array<String>] tokens to iterate over
# @param ngrams [Integer] largest n-gram size to produce
# @yield [String] each token, then each space-joined n-gram
# @return [Enumerator] when called without a block
def ngrams_iterator(token_list, ngrams)
  unless block_given?
    return enum_for(:ngrams_iterator, token_list, ngrams)
  end

  # Unigrams first, exactly as they appear in the list.
  token_list.each { |token| yield token }

  # Then each larger size; a size longer than the list yields nothing.
  2.upto(ngrams) do |width|
    token_list.each_cons(width) { |window| yield window.join(" ") }
  end
end
tokenizer(tokenizer, language: "en")
click to toggle source
# File lib/torchtext/data/utils.rb, line 4
# Looks up a tokenizer by name and returns it as a callable Method object.
#
# @param tokenizer [String, nil] nil selects the plain whitespace splitter;
#   "basic_english" selects the English normalizer
# @param language [String] language code; only "en" is accepted for
#   "basic_english"
# @return [Method] a callable taking one string and returning its tokens
# @raise [ArgumentError] if "basic_english" is requested for a language
#   other than "en"
# @raise [RuntimeError] for any other tokenizer name
def tokenizer(tokenizer, language: "en")
  return method(:split_tokenizer) if tokenizer.nil?

  # Every name other than "basic_english" is unsupported.
  raise "Not implemented yet" unless tokenizer == "basic_english"

  raise ArgumentError, "Basic normalization is only available for English(en)" if language != "en"

  method(:basic_english_normalize)
end
Private Instance Methods
basic_english_normalize(line)
click to toggle source
# File lib/torchtext/data/utils.rb, line 45
# Lowercases +line+, applies every pattern/replacement pair from
# PATTERNS_DICT in order, and splits the result on whitespace.
#
# Each pattern is applied to ALL of its occurrences (gsub). Replacing only
# the first match would leave later punctuation glued to neighbouring words.
#
# @param line [String] raw text; the caller's string is not modified
# @return [Array<String>] normalized tokens
def basic_english_normalize(line)
  normalized = PATTERNS_DICT.reduce(line.downcase) do |text, (pattern_re, replaced_str)|
    text.gsub(pattern_re, replaced_str)
  end
  normalized.split
end
split_tokenizer(x)
click to toggle source
# File lib/torchtext/data/utils.rb, line 36
# Default tokenizer: breaks the input apart on whitespace.
#
# @param x [String] text to tokenize
# @return [Array<String>] the whitespace-separated pieces of +x+
def split_tokenizer(x)
  tokens = x.split
  tokens
end