class JobParser::Parser
Constants
- ACCEPTED_ELEMENTS
Public Class Methods
new(html, from_url)
click to toggle source
# File lib/jobparser/parser.rb, line 6
# Sets up a parser for a single page.
#
# html     - markup handed to Nokogiri::HTML; kept unchanged in @html.
# from_url - value stored in @url (used by #job as the cache key).
#
# The parsed document is passed through strip_bad_elements before being
# stored in @doc, and @plain_text is filled from get_plain_text afterwards,
# so the text is taken from the already-cleaned document.
def initialize(html, from_url)
  @html = html
  @url = from_url
  parsed = Nokogiri::HTML(@html)
  @doc = strip_bad_elements(parsed)
  @plain_text = get_plain_text
end
Public Instance Methods
job()
click to toggle source
# File lib/jobparser/parser.rb, line 13
# Returns the job description for @url.
#
# When JobParser.cache reports a valid entry for @url, that cached result is
# returned as-is. Otherwise a fresh Hash is assembled from the facet helper
# methods (job_salary, job_title, ... - defined outside this view).
def job
  return JobParser.cache.fetch_result_for_url(@url) if JobParser.cache.valid_for_url?(@url)

  {
    url: @url,
    salary: job_salary,
    title: job_title,
    apply: apply_link,
    salary_string: job_salary_string,
    location: job_location,
    deadline: deadline,
    postcode: job_postcode
  }
end
Private Instance Methods
cache(result)
click to toggle source
# File lib/jobparser/parser.rb, line 35
# Persists +result+ via store_result_to_cache when JobParser.config[:cache_on]
# is truthy.
#
# Always returns +result+ itself, whether or not it was stored.
def cache(result)
  store_result_to_cache(result) if JobParser.config[:cache_on]
  result
end
facet_args()
click to toggle source
# File lib/jobparser/parser.rb, line 31
# Returns a three-element Array: the cleaned document, the source URL and the
# extracted plain text, in that order.
# NOTE(review): presumably splatted into facet constructors - confirm against
# the callers, which are not visible here.
def facet_args
  [
    @doc,
    @url,
    @plain_text
  ]
end
get_plain_text()
click to toggle source
# File lib/jobparser/parser.rb, line 55
# Returns the concatenated content of every text node in @doc as one String.
#
# The search is read-only, so it runs directly on @doc; no defensive copy of
# the document is needed.
#
# NOTE(review): an earlier version declared a blacklist
# (title/script/style/button) here but never applied it. It has been removed
# as dead code; the returned text is unchanged, so <title> text is still
# included. Confirm whether filtering was actually intended.
def get_plain_text
  @doc.search('//text()').text
end
store_result_to_cache(result)
click to toggle source
# File lib/jobparser/parser.rb, line 42
# Hands +result+ to JobParser.cache.store_to_file and returns whatever that
# call returns.
def store_result_to_cache(result)
  result_cache = JobParser.cache
  result_cache.store_to_file(result)
end
strip_bad_elements(doc)
click to toggle source
# File lib/jobparser/parser.rb, line 46
# Cleans +doc+ in place and returns it.
#
# Every <script>, <style> and <button> element is removed, then each <br> is
# replaced with a newline so line breaks survive text extraction.
#
# doc - document object responding to #xpath and #css (a Nokogiri document
#       in #initialize).
def strip_bad_elements(doc)
  %w[script style button].each { |tag_name| doc.xpath("//#{tag_name}").remove }

  doc.css("br").each do |line_break|
    line_break.replace "\n"
  end

  doc
end