class JobParser::Parser

Constants

ACCEPTED_ELEMENTS

Public Class Methods

new(html, from_url) click to toggle source
# File lib/jobparser/parser.rb, line 6
# Sets up a parser for a single fetched page.
#
# @param html [String] raw markup of the page; kept in @html and parsed
#   with Nokogiri::HTML
# @param from_url [String] address the markup came from; kept in @url
def initialize(html, from_url)
  @html = html
  @url = from_url
  parsed_document = Nokogiri::HTML(html)
  # @doc must be assigned before get_plain_text runs, because that helper
  # reads the text out of @doc.
  @doc = strip_bad_elements(parsed_document)
  @plain_text = get_plain_text
end

Public Instance Methods

job() click to toggle source
# File lib/jobparser/parser.rb, line 13
# Returns the job details for the page this parser was built with.
#
# When JobParser.cache reports a valid entry for @url, that stored result is
# returned unchanged. Otherwise a hash is assembled from the attribute helpers
# (job_salary, job_title, apply_link, ... - defined outside this excerpt).
#
# NOTE(review): the freshly built hash is returned without going through
# #cache - confirm whether storing it happens elsewhere.
#
# @return [Hash, Object] the assembled hash, or whatever the cache returns
def job
  return JobParser.cache.fetch_result_for_url(@url) if JobParser.cache.valid_for_url?(@url)

  {
    :url => @url,
    :salary => job_salary,
    :title => job_title,
    :apply => apply_link,
    :salary_string => job_salary_string,
    :location => job_location,
    :deadline => deadline,
    :postcode => job_postcode
  }
end

Private Instance Methods

cache(result) click to toggle source
# File lib/jobparser/parser.rb, line 35
# Passes +result+ to store_result_to_cache when the :cache_on entry of
# JobParser.config is truthy, and hands the same object back either way.
#
# @param result [Object] value to store
# @return [Object] +result+, untouched
def cache(result)
  store_result_to_cache(result) if JobParser.config[:cache_on]
  result
end
facet_args() click to toggle source
# File lib/jobparser/parser.rb, line 31
# Bundles the parser state into one array, in the fixed order
# document, URL, plain text.
#
# @return [Array] [@doc, @url, @plain_text]
def facet_args
  document = @doc
  source_url = @url
  page_text = @plain_text
  [document, source_url, page_text]
end
get_plain_text() click to toggle source
# File lib/jobparser/parser.rb, line 55
# Concatenates the content of every text node in @doc.
#
# The document is only read, so it is queried directly rather than copied
# first.
#
# NOTE(review): nothing is filtered out here, so text inside <title> is part
# of the result - confirm that is intended.
#
# @return [String] the text of all nodes matched by the XPath //text()
def get_plain_text
  @doc.search('//text()').text
end
store_result_to_cache(result) click to toggle source
# File lib/jobparser/parser.rb, line 42
# Forwards +result+ to the store_to_file method of JobParser.cache.
#
# @param result [Object] value to store
# @return [Object] whatever store_to_file returns
def store_result_to_cache(result)
  cache_store = JobParser.cache
  cache_store.store_to_file(result)
end
strip_bad_elements(doc) click to toggle source
# File lib/jobparser/parser.rb, line 46
# Cleans a parsed document in place: every <script>, <style> and <button>
# element is removed, and each <br> is replaced with a newline.
#
# @param doc [Object] parsed document responding to #xpath and #css
# @return [Object] the same +doc+, after modification
def strip_bad_elements(doc)
  %w[script style button].each do |tag_name|
    doc.xpath("//#{tag_name}").remove
  end

  doc.css('br').each do |line_break|
    line_break.replace("\n")
  end

  doc
end