ragnar_read_document {ragnar}R Documentation

Read an HTML document

Description

Read an HTML document

Usage

ragnar_read_document(
  x,
  ...,
  split_by_tags = frame_by_tags,
  frame_by_tags = NULL
)

Arguments

x

file path or url, passed on to rvest::read_html(), or an xml_node.

...

passed on to rvest::read_html()

split_by_tags

character vector of html tag names used to split the returned text

frame_by_tags

character vector of html tag names used to create a dataframe of the returned content

Value

If frame_by_tags is not NULL, then a data frame is returned, with column names c("frame_by_tags", "text").

If frame_by_tags is NULL but split_by_tags is not NULL, then a named character vector is returned.

If both frame_by_tags and split_by_tags are NULL, then a string (length-1 character vector) is returned.

Examples

file <- tempfile(fileext = ".html")
download.file("https://r4ds.hadley.nz/base-R.html", file, quiet = TRUE)

# with no arguments, returns a single string of the text.
file |> ragnar_read_document() |> str()

# use `split_by_tags` to get a named character vector of length > 1
file |>
  ragnar_read_document(split_by_tags = c("h1", "h2", "h3")) |>
  tibble::enframe("tag", "text")

# use `frame_by_tags` to get a dataframe where the
# headings associated with each text chunk are easily accessible
file |>
  ragnar_read_document(frame_by_tags = c("h1", "h2", "h3"))

# use `split_by_tags` and `frame_by_tags` together to further break up `text`.
file |>
  ragnar_read_document(
    split_by_tags = c("p"),
    frame_by_tags = c("h1", "h2", "h3")
  )

# Example workflow adding context to each chunk
file |>
  ragnar_read_document(frame_by_tags = c("h1", "h2", "h3")) |>
  glue::glue_data(r"--(
    ## Excerpt from the book "R for Data Science (2e)"
    chapter: {h1}
    section: {h2}
    content: {text}

    )--") |>
    # inspect
    _[6:7] |> cat(sep = "\n~~~~~~~~~~~\n")

# Advanced example of postprocessing the output of ragnar_read_document()
# to wrap code blocks in backticks, markdown style
library(dplyr, warn.conflicts = FALSE)
library(stringr)
library(rvest)
library(xml2)
file |>
  ragnar_read_document(frame_by_tags = c("h1", "h2", "h3"),
                       split_by_tags = c("p", "pre")) |>
  mutate(
    is_code = tag == "pre",
    text = ifelse(is_code,
                  str_c("```", text, "```", sep = "\n"),
                  text)) |>
  group_by(h1, h2, h3) |>
  summarise(text = str_flatten(text, "\n"), .groups = "drop") |>
  glue::glue_data(r"--(
    # Excerpt from the book "R for Data Science (2e)"
    chapter: {h1}
    section: {h2}
    content: {text}

    )--") |>
    # inspect
    _[9:10] |> cat(sep = "\n~~~~~~~~~~~\n")

# Example of preprocessing the input to ragnar_read_document()
# to wrap code in backticks, markdown style
# same outcome as above, except via pre processing instead of post processing.
file |>
  read_html() |>
  (\(doc) {
    # fence preformatted code with triple backticks
    for (node in html_elements(doc, "pre")) {
      xml_add_child(node, "code", "```\n", .where = 0)
      xml_add_child(node, "code", "\n```")
    }
    # wrap inline code with single backticks
    for (node in html_elements(doc, "code")) {
      if (!"pre" %in% xml_name(xml_parents(node))) {
        xml_text(node) <- str_c("`", xml_text(node), "`")
      }
    }
    doc
  })() |>
  ragnar_read_document(frame_by_tags = c("h1", "h2", "h3")) |>
  glue::glue_data(r"--(
    # Excerpt from the book "R for Data Science (2e)"
    chapter: {h1}
    section: {h2}
    content: {text}

    )--") |> _[6]

[Package ragnar version 0.2.0 Index]