`ragnar_read()` uses markitdown to convert a document to markdown. If
`frame_by_tags` or `split_by_tags` is provided, the converted markdown content
is then split into multiple rows of a data frame; otherwise, the markdown is
returned as a single-row data frame with the content in the `text` column.
Value
Always returns a data frame with the columns:
- `origin`: the file path or url
- `hash`: a hash of the text content
- `text`: the markdown content
If `split_by_tags` is not `NULL`, then a `tag` column is also included,
containing the corresponding tag for each text chunk. `""` is used for text
chunks that are not associated with a tag.
If `frame_by_tags` is not `NULL`, then additional columns are included for each
tag in `frame_by_tags`. The text chunks are associated with the tags in the
order they appear in the markdown content.
Examples
if (FALSE) { # reticulate::py_available()
  # Download a sample HTML page into a temporary file; every example below
  # reads from this file.
  file <- tempfile(fileext = ".html")
  download.file("https://r4ds.hadley.nz/base-R.html", file, quiet = TRUE)

  # Heading levels reused by the examples below.
  heading_tags <- c("h1", "h2", "h3")

  # Called without tag arguments, the result is a single-row data frame
  # with the markdown content in the `text` column.
  str(ragnar_read(file))

  # `split_by_tags` yields a data frame whose text is split at the
  # specified tags (here the heading tags).
  ragnar_read(file, split_by_tags = heading_tags)

  # `frame_by_tags` yields a data frame in which the headings associated
  # with each text chunk are easily accessible.
  ragnar_read(file, frame_by_tags = heading_tags)

  # Supplying both arguments breaks `text` up further.
  ragnar_read(file, split_by_tags = "p", frame_by_tags = heading_tags)

  # Example workflow: add context to each chunk.
  framed_chunks <- ragnar_read(file, frame_by_tags = heading_tags)
  excerpts <- glue::glue_data(framed_chunks, r"--(
## Excerpt from the book "R for Data Science (2e)"
chapter: {h1}
section: {h2}
content: {text}
)--")
  # Inspect two of the rendered excerpts.
  cat(excerpts[6:7], sep = "\n~~~~~~~~~~~\n")

  # Advanced example: post-process the output of ragnar_read() to add a
  # language to code blocks, markdown style.
  library(dplyr, warn.conflicts = FALSE)
  library(stringr)
  library(rvest)
  library(xml2)

  tagged_chunks <- ragnar_read(
    file,
    frame_by_tags = heading_tags,
    split_by_tags = c("p", "pre")
  )

  section_text <- tagged_chunks |>
    # Flag chunks that came from <pre> elements.
    mutate(is_code = tag == "pre") |>
    # str_replace() only touches the first "```" in each chunk.
    mutate(text = ifelse(is_code, str_replace(text, "```", "```r"), text)) |>
    # Collapse the chunks that share the same headings into one text.
    group_by(h1, h2, h3) |>
    summarise(text = str_flatten(text, "\n\n"), .groups = "drop")

  section_excerpts <- glue::glue_data(section_text, r"--(
# Excerpt from the book "R for Data Science (2e)"
chapter: {h1}
section: {h2}
content: {text}
)--")
  # Inspect two of the rendered excerpts.
  cat(section_excerpts[9:10], sep = "\n~~~~~~~~~~~\n")
}