class JobParser::Facets::Title

Public Class Methods

new(*args) click to toggle source
Calls superclass method JobParser::Facets::Facet::new
# File lib/jobparser/facets/title.rb, line 4
# Builds the facet and gives it its own Scorer instance.
#
# Every argument is handed on to the superclass constructor untouched; a
# bare `super` forwards the same argument list this method received.
def initialize(*args)
  super
  @scorer = Scorer.new
end

Public Instance Methods

parse() click to toggle source
# File lib/jobparser/facets/title.rb, line 9
# Works out the job title for the current document.
#
# When use_special_case(:title) yields anything other than nil, that value
# is returned as-is. Otherwise the page title and every suitable element are
# offered to the scorer, and the scorer's top match is tidied and returned.
def parse
  override = use_special_case(:title)
  return override unless override.nil?

  # The page title is scored before any element is looked at.
  score_page_title

  loop_over_elements do |_tag, elem|
    unless elem_not_suitable_as_title?(elem)
      text = Cleaner.strip_all_white_space(elem.content)

      # An id attribute suggesting the element stores the title.
      elem_has_job_title_id_score(elem, text)

      # A heading element whose text matches the page title.
      elem_heading_matches_page_title(elem, text)

      # Common words that feature in job titles.
      elem_matches_job_title_words(text)

      # A label for the title: the content of the next element is scored.
      elem_is_vacancy_title(elem, text)
    end
  end

  winner = @scorer.top_match.strip.gsub(NBSP, "")
  clean_title(winner)
end

Private Instance Methods

clean_title(title) click to toggle source
# File lib/jobparser/facets/title.rb, line 80
# Reduces a scored title to the text after its last colon and passes that
# through Cleaner.strip_all_white_space.
#
# Splitting "" (or a string made only of colons) yields an empty array, so
# `last` is nil in that case; `to_s` guarantees Cleaner is always handed a
# String instead of nil.
#
# title - the String chosen as the best title candidate.
#
# Returns the result of Cleaner.strip_all_white_space for that segment.
def clean_title(title)
  last_segment = title.split(":").last.to_s
  Cleaner.strip_all_white_space(last_segment)
end
elem_has_job_title_id_score(elem, content) click to toggle source
# File lib/jobparser/facets/title.rb, line 60
# Offers content to the scorer at 60 points, conditional on the element's
# id attribute (converted with to_s) being checked against
# JOB_TITLE_ID_REGEX.
#
# NOTE(review): presumably the score only counts when the regex matches;
# Scorer is defined elsewhere, so confirm there.
#
# elem    - the element whose "id" attribute is inspected.
# content - the text stored with the scorer.
#
# Returns whatever the scorer's if_regex_match returns.
def elem_has_job_title_id_score(elem, content)
  candidate = @scorer.store(content, 60)
  id_value = elem.attribute("id").to_s
  candidate.if_regex_match(JOB_TITLE_ID_REGEX, id_value)
end
elem_heading_matches_page_title(elem, content) click to toggle source
# File lib/jobparser/facets/title.rb, line 54
# For heading elements only: offers content to the scorer at 40 points,
# with a block that tests whether the page title includes content.
#
# elem    - the element; its tag name decides whether anything happens.
# content - the text stored with the scorer.
#
# Returns nil for non-heading elements, otherwise whatever the scorer's
# if_block_true returns.
def elem_heading_matches_page_title(elem, content)
  return unless elem_is_heading?(elem.name)

  candidate = @scorer.store(content, 40)
  candidate.if_block_true { page_title.include?(content) }
end
elem_is_heading?(name) click to toggle source
# File lib/jobparser/facets/title.rb, line 76
# Tells whether a tag name is one of the heading tags h1 through h5.
#
# NOTE(review): h6 is not in the list - confirm that is intended.
#
# name - the tag name to test (compared exactly, so case matters).
#
# Returns true or false.
def elem_is_heading?(name)
  %w[h1 h2 h3 h4 h5].any? { |tag| tag == name }
end
elem_is_vacancy_title(elem, content) click to toggle source
# File lib/jobparser/facets/title.rb, line 39
# When content matches VACANCY_TITLE_REGEX, treats the element as a label
# and offers the cleaned text of the element after it to the scorer at 30
# points, with a block that tests whether that next element's tag name is
# in ACCEPTED_ELEMENTS.
#
# The next element is looked up once and its content cleaned once; nothing
# is stored when there is no next element or its cleaned content is empty.
#
# elem    - the element that may be acting as the label.
# content - the label element's cleaned text.
#
# Returns nil when nothing was stored, otherwise whatever the scorer's
# if_block_true returns.
def elem_is_vacancy_title(elem, content)
  return nil unless VACANCY_TITLE_REGEX.match(content)

  sibling = elem.next_element
  return nil unless sibling

  next_content = Cleaner.strip_all_white_space(sibling.content)
  return nil if next_content.empty?

  @scorer.store(next_content, 30).if_block_true do
    ACCEPTED_ELEMENTS.include?(sibling.name)
  end
end
elem_matches_job_title_words(content) click to toggle source
# File lib/jobparser/facets/title.rb, line 50
# Offers content to the scorer at 20 points, conditional on content itself
# being checked against JOB_TITLE_WORDS.
#
# content - the element's cleaned text.
#
# Returns whatever the scorer's if_regex_match returns.
def elem_matches_job_title_words(content)
  candidate = @scorer.store(content, 20)
  candidate.if_regex_match(JOB_TITLE_WORDS, content)
end
elem_not_suitable_as_title?(elem) click to toggle source
# File lib/jobparser/facets/title.rb, line 72
# Tells whether an element should be skipped as a title candidate: its
# content is blank once stripped, or it has more than 10
# whitespace-separated words.
#
# The content is read from the element once. A separate check for the empty
# string is unnecessary because stripping "" already gives an empty string.
#
# elem - the element whose content is inspected.
#
# Returns true or false.
def elem_not_suitable_as_title?(elem)
  text = elem.content
  text.strip.empty? || text.split(" ").length > 10
end
page_title() click to toggle source
# File lib/jobparser/facets/title.rb, line 64
# Reads the text of the document's <title> element.
#
# at_css gives nil when the document has no matching element, so calling
# content on it directly would raise NoMethodError; an empty string is
# returned in that case so callers that use String methods on the result
# (such as include?) keep working.
#
# Returns the title text, or "" when @doc has no title element.
def page_title
  title_node = @doc.at_css("title")
  title_node ? title_node.content : ""
end
score_page_title() click to toggle source
# File lib/jobparser/facets/title.rb, line 68
# Hands the page title to the scorer's store_and_score with 10 points.
#
# Returns whatever store_and_score returns.
def score_page_title
  title_text = page_title
  @scorer.store_and_score(title_text, 10)
end