class JobParser::Facets::Title
Public Class Methods
new(*args)
click to toggle source
Calls superclass method
JobParser::Facets::Facet::new
# File lib/jobparser/facets/title.rb, line 4
# Runs the superclass constructor with the same positional arguments, then
# stores a fresh Scorer instance in @scorer.
#
# @param args positional arguments handed on unchanged to the superclass
def initialize(*args)
  super(*args)

  @scorer = Scorer.new
end
Public Instance Methods
parse()
click to toggle source
# File lib/jobparser/facets/title.rb, line 9
# Determines the title for the document.
#
# A non-nil result from use_special_case(:title) is returned as-is. Otherwise
# the page title is scored, every element yielded by loop_over_elements is run
# through the scoring checks below (in this order), and the scorer's top match
# is stripped, has NBSP removed, and is passed through clean_title.
#
# @return the special-case result, or the cleaned top-scoring candidate
def parse
  override = use_special_case(:title)
  return override unless override.nil?

  score_page_title

  # the first block argument is not needed for title scoring
  loop_over_elements do |_name, node|
    next if elem_not_suitable_as_title?(node)

    text = Cleaner.strip_all_white_space(node.content)

    # does the element have an id that means it might store the title?
    elem_has_job_title_id_score(node, text)
    # or is it a heading element that matches the page title?
    elem_heading_matches_page_title(node, text)
    # does it have some common words that feature in job titles?
    elem_matches_job_title_words(text)
    # if it's the title label, score the content of the next element
    elem_is_vacancy_title(node, text)
  end

  best = @scorer.top_match.strip
  clean_title(best.gsub(NBSP, ""))
end
Private Instance Methods
clean_title(title)
click to toggle source
# File lib/jobparser/facets/title.rb, line 80
# Keeps only the text after the last ":" in +title+ and passes it through
# Cleaner.strip_all_white_space.
#
# @param title [String] raw title candidate
# @return whatever Cleaner.strip_all_white_space returns for the last segment
def clean_title(title)
  # String#split yields [] for "" and for strings made only of colons, so
  # `last` can be nil; coerce to "" so the cleaner always receives a String.
  # NOTE(review): assumes Cleaner.strip_all_white_space expects a String — confirm.
  last_segment = title.split(":").last.to_s
  Cleaner.strip_all_white_space(last_segment)
end
elem_has_job_title_id_score(elem, content)
click to toggle source
# File lib/jobparser/facets/title.rb, line 60
# Stores +content+ on the scorer with the value 60 (presumably the score
# weight — confirm against Scorer), conditional on the element's "id"
# attribute, converted to a string, matching JOB_TITLE_ID_REGEX.
#
# @param elem element whose "id" attribute is inspected
# @param content [String] text to store as a candidate
def elem_has_job_title_id_score(elem, content)
  candidate = @scorer.store(content, 60)
  candidate.if_regex_match(JOB_TITLE_ID_REGEX, elem.attribute("id").to_s)
end
elem_heading_matches_page_title(elem, content)
click to toggle source
# File lib/jobparser/facets/title.rb, line 54
# For heading elements only: stores +content+ on the scorer with the value 40,
# conditional on the page title containing +content+.
#
# @param elem element whose tag name is checked with elem_is_heading?
# @param content [String] text to store as a candidate
# @return nil when the element is not a heading
def elem_heading_matches_page_title(elem, content)
  return unless elem_is_heading?(elem.name)

  candidate = @scorer.store(content, 40)
  candidate.if_block_true { page_title.include?(content) }
end
elem_is_heading?(name)
click to toggle source
# File lib/jobparser/facets/title.rb, line 76
# Tells whether +name+ is one of the tag names h1 through h5.
# The comparison is exact (case-sensitive), and h6 is not in the list.
#
# @param name tag name to test
# @return [Boolean]
def elem_is_heading?(name)
  %w[h1 h2 h3 h4 h5].any? { |tag| tag == name }
end
elem_is_vacancy_title(elem, content)
click to toggle source
# File lib/jobparser/facets/title.rb, line 39
# When +content+ matches VACANCY_TITLE_REGEX, looks at the element that
# follows +elem+: if it exists and its whitespace-stripped content is not
# empty, that content is stored on the scorer with the value 30, conditional
# on the following element's tag name being in ACCEPTED_ELEMENTS.
#
# @param elem element whose next sibling element may hold the title
# @param content [String] stripped text of +elem+
# @return nil when nothing is stored; otherwise the scorer call's result
def elem_is_vacancy_title(elem, content)
  return unless VACANCY_TITLE_REGEX.match(content)

  # look the sibling up once instead of on every use
  sibling = elem.next_element
  return unless sibling

  # strip once and reuse the result for both the emptiness check and the store
  sibling_text = Cleaner.strip_all_white_space(sibling.content)
  return if sibling_text.empty?

  @scorer.store(sibling_text, 30).if_block_true { ACCEPTED_ELEMENTS.include?(sibling.name) }
end
elem_matches_job_title_words(content)
click to toggle source
# File lib/jobparser/facets/title.rb, line 50
# Stores +content+ on the scorer with the value 20, conditional on +content+
# matching JOB_TITLE_WORDS.
#
# @param content [String] text that is both tested and stored
def elem_matches_job_title_words(content)
  candidate = @scorer.store(content, 20)
  candidate.if_regex_match(JOB_TITLE_WORDS, content)
end
elem_not_suitable_as_title?(elem)
click to toggle source
# File lib/jobparser/facets/title.rb, line 72
# Tells whether an element should be skipped as a title candidate: its content
# is empty or whitespace-only, or it has more than 10 whitespace-separated words.
#
# @param elem object responding to #content with a String
# @return [Boolean]
def elem_not_suitable_as_title?(elem)
  # read the content once; the blank check also covers the empty string
  text = elem.content
  text.strip.empty? || text.split(" ").length > 10
end
page_title()
click to toggle source
# File lib/jobparser/facets/title.rb, line 64
# Returns the content of the first node matching the CSS selector "title" in
# @doc, or an empty string when no such node exists.
#
# @return [String]
def page_title
  # at_css gives nil for a document without a <title>; fall back to "" so
  # callers that use String methods on the result (e.g. include?) keep working
  title_node = @doc.at_css("title")
  title_node ? title_node.content : ""
end
score_page_title()
click to toggle source
# File lib/jobparser/facets/title.rb, line 68
# Hands the page title to the scorer's store_and_score with the value 10.
def score_page_title
  title_text = page_title
  @scorer.store_and_score(title_text, 10)
end