class Quesadilla::Extractor

Extract entities from text

Constants

REPLACE_TOKEN

Invisible character from the reserved range replaces markdown we’ve already parsed.

Public Class Methods

default_options() click to toggle source

@return [Hash] default extractor options

# File lib/quesadilla/extractor.rb, line 17
# Build the baseline option set for an extractor.
#
# A new Hash is assembled on every call, so callers may modify the result
# without affecting later calls.
#
# @return [Hash] default extractor options
def self.default_options
  # Markdown master switch followed by the per-construct switches
  markdown_defaults = {
    markdown: true,
    markdown_code: true,
    markdown_links: true,
    markdown_triple_emphasis: true,
    markdown_double_emphasis: true,
    markdown_emphasis: true,
    markdown_strikethrough: true
  }

  # Non-Markdown entity kinds and their optional validators
  entity_defaults = {
    hashtags: true,
    hashtag_validator: nil,
    autolinks: true,
    emoji: true,
    users: false,
    user_validator: nil
  }

  # HTML output settings
  html_defaults = {
    html: true,
    html_renderer: Quesadilla::HTMLRenderer
  }

  markdown_defaults.merge(entity_defaults).merge(html_defaults)
end
new(options = {}) click to toggle source

@param options [Hash] an optional options hash. Defaults to `Quesadilla::Extractor.default_options`. @option options markdown [Boolean] Should extract Markdown. Defaults to `true`. @option options markdown_code [Boolean] Should extract Markdown code. Defaults to `true`. @option options markdown_links [Boolean] Should extract Markdown links. Defaults to `true`. @option options markdown_triple_emphasis [Boolean] Should extract Markdown triple emphasis (bold italic). Defaults to `true`. @option options markdown_double_emphasis [Boolean] Should extract Markdown double emphasis (bold). Defaults to `true`. @option options markdown_emphasis [Boolean] Should extract Markdown emphasis (italic). Defaults to `true`. @option options markdown_strikethrough [Boolean] Should extract Markdown strikethrough. Defaults to `true`. @option options hashtags [Boolean] Should extract hashtags. Defaults to `true`. @option options hashtag_validator A callable object to validate a hashtag. This should return `true` or `false`. Invalid hashtags will be left as plain text. If the validator is `nil`, all hashtags will be extracted. Defaults to `nil`. @option options autolinks [Boolean] Should automatically detect links. Defaults to `true`. @option options emoji [Boolean] Should extract named emoji. Defaults to `true`. @option options users [Boolean] Should extract user mentions. Defaults to `false`. @option options user_validator A callable object to validate a username. This should return the user ID of the user or nil if it is invalid. Invalid users will be left as plain text. If the validator is `nil`, all usernames will be extracted. Defaults to `nil`. @option options html [Boolean] Should generate HTML. Defaults to `true`. @option options html_renderer [Class] class to use as HTML renderer. Defaults to `Quesadilla::HTMLRenderer`.

# File lib/quesadilla/extractor.rb, line 53
# Set up an extractor from the class defaults overlaid with caller overrides.
#
# @param options [Hash] overrides merged on top of `default_options`
def initialize(options = {})
  defaults = self.class.default_options
  @options = defaults.merge(options)

  # A renderer is only instantiated when HTML output is requested;
  # otherwise @renderer is left unset.
  return unless @options[:html]

  @renderer = @options[:html_renderer].new
end

Public Instance Methods

extract(text) click to toggle source

Extract entities from text. @param text [String] the text to extract from. @return [Hash] hash containing the display text, HTML text, and entities.

# File lib/quesadilla/extractor.rb, line 61
# Extract entities from text.
#
# The input string itself is not modified; the method works on copies held in
# instance variables.
#
# @param text [String] the text to extract from
# @return [Hash] `:display_text` and `:entities`, plus `:display_html` when the
#   `:html` option is enabled
def extract(text)
  @original_text = text.dup

  # Emoji colon-syntax is handled before the working copy is taken
  replace_emoji if @options[:emoji]

  @working_text = @original_text.dup
  @entities = []

  # Run each enabled extraction pass; the order is significant
  extract_markdown if @options[:markdown]
  extract_hashtags if @options[:hashtags]
  extract_autolinks if @options[:autolinks]
  extract_users if @options[:users]

  # Order entities by where they start in the text
  @entities.sort! { |left, right| left[:indices].first <=> right[:indices].first }

  # Produce the display text from the sorted entities
  display_text = sub_entities(@original_text, @entities)

  result = { display_text: display_text, entities: @entities }
  if @options[:html]
    result[:display_html] = display_html(display_text, @entities)
  end
  result
end

Private Instance Methods

display_url(url) click to toggle source
# File lib/quesadilla/extractor.rb, line 98
# Build the shortened form of a URL used for display.
#
# Removes a leading `http://` or `https://` scheme and a leading `www.`
# (case-insensitively), passes the result through `String#q_truncate` with a
# limit of 32 and '…' as the omission marker, and finally drops one trailing
# slash.
#
# Only the start of the URL is stripped: occurrences of `www.` or `http://`
# later in the string (for example inside a path or query parameter) are part
# of the address and are left untouched.
#
# @param url [String] the URL to shorten
# @return [String] the display form of the URL
def display_url(url)
  # \A anchors the pattern so it can only match the leading scheme/host prefix
  stripped = url.sub(%r{\A(?:https?://)?(?:www\.)?}i, '')
  # q_truncate is a String extension defined outside this method
  shortened = stripped.q_truncate(32, omission: '…')
  shortened.chomp('/')
end
quality_url(url) click to toggle source
# File lib/quesadilla/extractor.rb, line 104
# Ensure a URL carries a scheme, defaulting to `http://`.
#
# A URL counts as already qualified only when it *starts* with a scheme
# (`name://`). A schemeless URL that merely contains `://` further along, such
# as `example.com/?next=http://other.com`, still gets `http://` prepended.
#
# @param url [String] a URL with or without a scheme
# @return [String] the URL unchanged if it starts with a scheme, otherwise
#   the URL prefixed with `http://`
def quality_url(url)
  # Scheme syntax: a letter followed by letters, digits, '+', '.' or '-'
  return url if url.match(%r{\A[a-z][a-z0-9+.\-]*://}i)

  "http://#{url}"
end
sub_entities(input_text, entities, display = false) { |entity| ... } click to toggle source
# File lib/quesadilla/extractor.rb, line 109
# Replace each entity's span in the text with its substitute text.
#
# With `display` false, entities are located by `:indices`, their `:text` is
# the string being replaced, and each entity gets a `:display_indices` pair
# recording where its replacement sits in the returned text. With `display`
# true, entities are located by `:display_indices` and their `:display_text`
# is the string being replaced; entities are not modified.
#
# The replacement for an entity is the block's return value when a block is
# given, otherwise the entity's `:display_text`.
#
# Entities are processed in the order given; a running offset accounts for the
# length change of earlier replacements, so they are expected to be ordered by
# start index.
#
# @param input_text [String] the text to substitute into (not mutated)
# @param entities [Array<Hash>] entity hashes as described above
# @param display [Boolean] whether to work in display coordinates
# @yieldparam entity [Hash] the entity being substituted
# @yieldreturn [String] the replacement text for that entity
# @return [String] the text with every differing entity replaced
def sub_entities(input_text, entities, display = false, &block)
  output_text = input_text
  offset = 0

  entities.each do |entity|
    replaced_text = display ? entity[:display_text] : entity[:text]
    replacement = block_given? ? yield(entity) : entity[:display_text]
    indices = display ? entity[:display_indices] : entity[:indices]

    # Splice only when the replacement actually differs from what is there
    unless replaced_text == replacement
      # Translate the entity's bounds into positions in the current output
      start = [indices[0] - offset, 0].max
      stop = indices[1] - offset

      # Exclusive range: everything strictly before the entity, including a
      # lone leading character when the entity starts at index 1
      before_frag = output_text[0...start]
      # Slicing past the end yields nil, which means nothing follows
      after_frag = output_text[stop..-1] || ''

      output_text = before_frag + replacement + after_frag
    end

    # How much shorter (or, if negative, longer) the text just became
    adjust = replaced_text.length - replacement.length
    unless display
      entity[:display_indices] = [entity[:indices][0] - offset, entity[:indices][1] - offset - adjust]
    end
    offset += adjust
  end

  output_text
end