class JobParser::ParseSchema

Constants

EXTRA_SCHEMA_TEXT_FIELDS

Public Class Methods

new(html, from_url) click to toggle source
Calls superclass method JobParser::Parser::new
# File lib/jobparser/parseschema.rb, line 26
# Builds a schema-aware parser for +html+ fetched from +from_url+.
#
# Ensures "span" is present in Parser::ACCEPTED_ELEMENTS before delegating
# to the superclass constructor with the same two arguments.
#
# Parser::ACCEPTED_ELEMENTS is a shared constant, so the registration is
# guarded: without the check every new instance would append another
# duplicate "span" entry and the array would grow without bound.
def initialize(html, from_url)
  accepted = Parser::ACCEPTED_ELEMENTS
  accepted.push("span") unless accepted.include?("span")
  super(html, from_url)
end

Public Instance Methods

job() click to toggle source
Calls superclass method JobParser::Parser#job
# File lib/jobparser/parseschema.rb, line 31
# Extends the superclass job hash with schema-specific data.
#
# Always sets :schema to true. Unless the superclass result is flagged
# :from_cache, every name in EXTRA_SCHEMA_TEXT_FIELDS is underscored,
# looked up through the matching "job_<name>" method, and stored under the
# underscored symbol when the value is neither nil nor empty.
# Returns whatever +cache+ returns for the resulting hash.
def job
  parsed = super
  parsed[:schema] = true
  return cache(parsed) if parsed[:from_cache]

  EXTRA_SCHEMA_TEXT_FIELDS.each do |schema_field|
    key = underscore(schema_field).to_sym
    value = send("job_#{key}")
    next if value.nil? || value.empty?

    parsed[key] = value
  end
  cache(parsed)
end

Private Instance Methods

deadline() click to toggle source
# File lib/jobparser/parseschema.rb, line 88
# No deadline extraction is implemented here; always returns nil.
def deadline
  nil
end
does_use_schema?() click to toggle source
# File lib/jobparser/parseschema.rb, line 91
# Reports whether any element in @doc declares the schema.org JobPosting
# itemtype. Returns true or false.
def does_use_schema?
  job_posting_type = "http://schema.org/JobPosting"
  all_nodes = @doc.css("*")
  all_nodes.any? { |node| node['itemtype'] == job_posting_type }
end
find_with_itemprop(prop) click to toggle source
# File lib/jobparser/parseschema.rb, line 107
# Returns the first element in @doc whose itemprop attribute equals +prop+,
# or nil when no element matches.
#
# Uses +find+ so the scan stops at the first match instead of filtering
# every element and then discarding all but the first result.
def find_with_itemprop(prop)
  @doc.css("*").find { |elem| elem['itemprop'] == prop }
end
get_content_at_prop(prop) click to toggle source
# File lib/jobparser/parseschema.rb, line 97
# Returns the content of the first element carrying itemprop +prop+, passed
# through Cleaner.strip_all_white_space. Returns "" when no such element
# exists.
def get_content_at_prop(prop)
  node = find_with_itemprop(prop)
  return "" unless node

  Cleaner.strip_all_white_space(node.content)
end
is_content_at_prop?(prop) click to toggle source
# File lib/jobparser/parseschema.rb, line 102
# Tells whether the first element carrying itemprop +prop+ has non-empty
# content. When no element is found, the falsy lookup result itself is
# returned rather than false.
def is_content_at_prop?(prop)
  node = find_with_itemprop(prop)
  return node unless node

  !node.content.empty?
end
job_location() click to toggle source
# File lib/jobparser/parseschema.rb, line 65
# Builds the job's location string.
#
# When the document contains a schema.org PostalAddress element, the
# non-empty address parts are joined with ", " in a fixed order. Otherwise
# falls back to the addressLocality property when it has content, and to
# jobLocation when it does not.
def job_location
  has_postal_address = @doc.css("*").any? { |node| node['itemtype'] == "http://schema.org/PostalAddress" }

  if has_postal_address
    parts = %w{ streetAddress addressLocality addressRegion addressCountry postalCode }
    return parts.map { |name| get_content_at_prop(name) }.reject(&:empty?).join(", ")
  end

  # some sites don't use the address stuff properly
  fallback_prop = is_content_at_prop?("addressLocality") ? "addressLocality" : "jobLocation"
  get_content_at_prop(fallback_prop)
end
job_postcode() click to toggle source
# File lib/jobparser/parseschema.rb, line 84
# Returns the portion of job_location matched by POSTCODE_REGEX, or nil
# when the location contains no match.
def job_postcode
  found = POSTCODE_REGEX.match(job_location)
  found ? found[0] : nil
end
job_salary() click to toggle source
# File lib/jobparser/parseschema.rb, line 46
# Parses the salary string into a two-element integer array.
#
# Everything matching CLEAN_SALARY_REGEX is removed from job_salary_string,
# then SALARY_GROUP_REGEX is applied; capture groups 1 and 2 are converted
# with to_i. Returns nil when the cleaned string does not match.
def job_salary
  cleaned = job_salary_string.gsub(CLEAN_SALARY_REGEX, "")
  groups = SALARY_GROUP_REGEX.match(cleaned)
  groups && [groups[1].to_i, groups[2].to_i]
end
job_salary_string() click to toggle source
# File lib/jobparser/parseschema.rb, line 61
# Returns the cleaned content of the element tagged with the "baseSalary"
# itemprop, as produced by get_content_at_prop.
def job_salary_string
  salary_prop = "baseSalary"
  get_content_at_prop(salary_prop)
end
job_title() click to toggle source
# File lib/jobparser/parseschema.rb, line 53
# Returns the cleaned content of the element tagged with the "title"
# itemprop, as produced by get_content_at_prop.
def job_title
  title_prop = "title"
  get_content_at_prop(title_prop)
end