module RbLibText

Constants

VERSION

Public Class Methods

or_pattern()
# File lib/rb_lib_text.rb, line 5
# Builds the master tokenizer pattern: one alternation made of every
# sub-pattern listed below.
#
# Alternatives are tried in the order they appear in the hash, so the
# specific shapes (comma-grouped numbers, times, URLs, contractions, ...)
# must stay above the generic fallbacks near the bottom. The final
# +all_other+ entry matches any single non-whitespace character, so nothing
# but whitespace is ever skipped.
#
# The union is compiled on first use; later calls return the same frozen
# Regexp instead of recompiling two dozen patterns each time.
#
# @return [Regexp] union of all token patterns
def self.or_pattern
  @or_pattern ||= begin
    patterns = {
      html_chars: '&\w+;',  # separates the junk that comes from > and < and &
      numbers_commas: '[\-\$]?\d{1,3}(?:,\d{3})+', # like 2,000,000
      times: '\d?\d:\d{2}',  # like 2:12
      money: '-?\$?\d+[.]\d+%?', #Catch money numerics
      acronyms: '(?:\w{1}\.{1})+',  # like U.T.
      possessive_mentions: '@\w+', #splits possessive off of @jimbob's
      possessive_hashtags: '#\w+', #splits possessive off of #tcot's
      tags_contractions: '[\w]+[\'‘’][\w]+', #don't split don't and can't and it's
      emails: '[\w\.\d]+@[\w\.\d]+\.[\w]+', #catch email addresses
      urls: 'https?://[-_/~%\w\d\.]*[_/~\w\d]', #Catch url addresses
      # sideways_text_emoji: '>?[:;=][\'\-D\)\]\(\[pPdoO/\*3\\]+',
      sideways_text_emoji: '>?[:;=8][\'\-D\)\(3dPpOo\*\/]+',
      ellipses: '\.{3}',
      en_em_dash: '-{2,3}', #Catch en and em dashes
      slashes: '[\w]+(?:[/\-][\w]+)+', #Grammatical / -
      punct: '[\"“”‘’\'\\.\\?!…,:;»«\(\)]', #punctuation to split on
      tags_mentions: '[\w#@\d%$\u00B0]+', #Group all of these things together
      hearts: '<+\/?3', # <3
      # Ruby regexps have no \UXXXXXXXX escape (it degrades to a literal "U"),
      # so code points are written with the \u{...} form instead.
      emoji_block0: '[\u{2600}-\u{27BF}]',
      emoji_block1: '[\u{1F300}-\u{1F64F}]',
      emoji_block2: '[\u{1F680}-\u{1F6FF}]',
      other_punct: '[\u2014\u2013]',
      all_other: '[^\s]', #Split any other weird chars that may have been missed
    }

    Regexp.union(patterns.values.map { |value| Regexp.new(value) }).freeze
  end
end
tokens(text)
# File lib/rb_lib_text.rb, line 35
# Splits +text+ into tokens.
#
# Ellipses are normalized before scanning: each U+2026 character is spelled
# out as three periods, and then every run of two or more periods is
# collapsed to exactly three.
#
# @param text [String] raw text to tokenize
# @return [Array] successive matches of or_pattern found in the normalized text
def self.tokens(text)
  normalized = text.gsub("\u2026", "...").gsub(/\.{2,}/, "...")
  normalized.scan(or_pattern)
end