ragnar_read_document {ragnar} | R Documentation |
Read an HTML document
Description
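Read an HTML document and return its text, optionally split into pieces or arranged in a data frame according to HTML tag names.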
Usage
ragnar_read_document(
x,
...,
split_by_tags = frame_by_tags,
frame_by_tags = NULL
)
Arguments
x |
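file path or url, passed on to `rvest::read_html()`; an already parsed HTML document (as produced by `read_html()` in the last example) is also accepted |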
... |
passed on to `rvest::read_html()` |
split_by_tags |
character vector of html tag names used to split the returned text |
frame_by_tags |
character vector of html tag names used to create a dataframe of the returned content |
Value
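If `frame_by_tags` is not `NULL`, then a data frame is returned, with one column per tag named in `frame_by_tags` followed by a `text` column, i.e. column names `c(frame_by_tags, "text")` (for example `h1`, `h2`, `h3`, `text`). The examples below also rely on a `tag` column when `split_by_tags` is supplied together with `frame_by_tags`.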
If `frame_by_tags` is `NULL` but `split_by_tags` is not `NULL`, then a named character vector is returned.
If both `frame_by_tags` and `split_by_tags` are `NULL`, then a string (length-1 character vector) is returned.
Examples
# Download one chapter of "R for Data Science (2e)" into a temporary file;
# the remaining examples read the page from this local copy.
file <- tempfile(fileext = ".html")
download.file(
  url = "https://r4ds.hadley.nz/base-R.html",
  destfile = file,
  quiet = TRUE
)

# With both tag arguments left at their defaults, the text of the page comes
# back as a single string.
str(ragnar_read_document(file))
# Pass `split_by_tags` to get a named character vector with more than one
# element; enframe() turns it into a two-column table for easier inspection.
tibble::enframe(
  ragnar_read_document(file, split_by_tags = c("h1", "h2", "h3")),
  name = "tag",
  value = "text"
)
# Pass `frame_by_tags` to get a data frame instead, so the headings that
# belong to each chunk of text are directly available as columns.
ragnar_read_document(file, frame_by_tags = c("h1", "h2", "h3"))
# Combine both arguments: frame by heading, then break `text` up further
# at every paragraph.
ragnar_read_document(
  file,
  frame_by_tags = c("h1", "h2", "h3"),
  split_by_tags = "p"
)
# Example workflow: prefix every chunk with the headings it sits under, so
# each piece of text carries its own context.
file |>
  ragnar_read_document(frame_by_tags = c("h1", "h2", "h3")) |>
  glue::glue_data(
    r"--(
## Excerpt from the book "R for Data Science (2e)"
chapter: {h1}
section: {h2}
content: {text}
)--"
  ) |>
  # show only the 6th and 7th results, separated by a divider line
  _[6:7] |>
  cat(sep = "\n~~~~~~~~~~~\n")
# Advanced example: post-process the output of ragnar_read_document() so that
# preformatted code ends up fenced in triple backticks, markdown style.
library(dplyr, warn.conflicts = FALSE)
library(stringr)
library(rvest)
library(xml2)

file |>
  ragnar_read_document(
    split_by_tags = c("p", "pre"),
    frame_by_tags = c("h1", "h2", "h3")
  ) |>
  mutate(
    # rows whose `tag` is "pre" hold code: fence them, leave the rest as is
    is_code = tag == "pre",
    text = ifelse(is_code, str_c("```", text, "```", sep = "\n"), text)
  ) |>
  # stitch the pieces that share an h1/h2/h3 combination back together
  group_by(h1, h2, h3) |>
  summarise(text = str_flatten(text, collapse = "\n"), .groups = "drop") |>
  glue::glue_data(
    r"--(
# Excerpt from the book "R for Data Science (2e)"
chapter: {h1}
section: {h2}
content: {text}
)--"
  ) |>
  # show only the 9th and 10th results, separated by a divider line
  _[9:10] |>
  cat(sep = "\n~~~~~~~~~~~\n")
# Pre-processing variant: edit the parsed HTML before handing it to
# ragnar_read_document(), so code is already wrapped in backticks when the
# text is extracted. Intended to give the same outcome as the post-processing
# example above.
file |>
  read_html() |>
  (\(page) {
    # Fence each <pre> element: a <code> child holding "```\n" is inserted as
    # its first child and another holding "\n```" is appended as its last.
    for (pre_node in html_elements(page, "pre")) {
      xml_add_child(pre_node, "code", "```\n", .where = 0)
      xml_add_child(pre_node, "code", "\n```")
    }

    # Surround inline code with single backticks; <code> elements that have a
    # <pre> among their ancestors are left alone.
    for (code_node in html_elements(page, "code")) {
      inside_pre <- "pre" %in% xml_name(xml_parents(code_node))
      if (inside_pre) {
        next
      }
      xml_text(code_node) <- str_c("`", xml_text(code_node), "`")
    }

    page
  })() |>
  ragnar_read_document(frame_by_tags = c("h1", "h2", "h3")) |>
  glue::glue_data(
    r"--(
# Excerpt from the book "R for Data Science (2e)"
chapter: {h1}
section: {h2}
content: {text}
)--"
  ) |>
  _[6]
[Package ragnar version 0.2.0 Index]