Skip to contents

Convert files to markdown

Usage

read_as_markdown(x, ..., canonical = FALSE)

Arguments

x

A file path or URL. Accepts a wide variety of file types, including PDF, PowerPoint, Word, Excel, images (EXIF metadata and OCR), audio (EXIF metadata and speech transcription), HTML, text-based formats (CSV, JSON, XML), ZIP files (iterates over contents), YouTube URLs, and EPUBs.

...

Passed on to MarkItDown.convert()

canonical

Logical. If TRUE, the output from MarkItDown is post-processed with commonmark::markdown_commonmark().

Value

A single string of markdown

Examples

if (FALSE) { # reticulate::py_available()
# HTML ----
# Fetch a web page and print the first 1000 characters of its markdown.
page_url <- "https://r4ds.hadley.nz/base-R.html"
cat(substr(read_as_markdown(page_url), 1, 1000))

# Same page, with the output post-processed by commonmark.
cat(substr(read_as_markdown(page_url, canonical = TRUE), 1, 1000))

# PDF ----
# Use the NEWS.pdf that ships in R's own doc directory.
news_pdf <- file.path(R.home("doc"), "NEWS.pdf")
cat(substr(read_as_markdown(news_pdf), 1, 1000))
## alternative:
# pdftools::pdf_text(news_pdf) |> substr(1, 2000) |> cat()

# Images ----
# Describe an image in markdown using OpenAI; both approaches below only run
# when an API key is present in the environment.
logo_jpg <- file.path(R.home("doc"), "html", "logo.jpg")
has_openai_key <- nzchar(Sys.getenv("OPENAI_API_KEY"))

if (has_openai_key) {
  # if (xfun::is_macos()) system("brew install ffmpeg")
  reticulate::py_require("openai")
  openai <- reticulate::import("openai")
  openai_client <- openai$OpenAI()
  read_as_markdown(
    logo_jpg,
    llm_client = openai_client,
    llm_model = "gpt-4.1-mini"
  )
  # Sample output:
  # # Description:
  # The image displays the logo of the R programming language. It features a
  # large, stylized capital letter "R" in blue, positioned prominently in the
  # center. Surrounding the "R" is a gray oval shape that is open on the right
  # side, creating a dynamic and modern appearance. The R logo is commonly
  # associated with statistical computing, data analysis, and graphical
  # representation in various scientific and professional fields.
}

# Alternative approach to image conversion: hand the image to an ellmer chat
# (requires the optional ellmer and magick packages).
if (
  has_openai_key &&
    rlang::is_installed("ellmer") &&
    rlang::is_installed("magick")
) {
  image_chat <- ellmer::chat_openai(echo = TRUE)
  image_chat$chat(
    "Describe this image",
    ellmer::content_image_file(logo_jpg)
  )
}
}