Skip to contents

Convert files to markdown

Usage

read_as_markdown(x, ..., canonical = FALSE)

Arguments

x

A file path or URL.

...

These dots are for future extensions and must be empty.

canonical

Logical; whether to post-process the output from MarkItDown with commonmark::markdown_commonmark(). Defaults to FALSE.

Value

A single string of Markdown.

Examples

# convert html
read_as_markdown("https://r4ds.hadley.nz/base-R.html") |>
  substr(1, 1000) |> cat()
#> # 27  A field guide to base R – R for Data Science (2e)
#> 
#> 1. [Program](./program.html)
#> 2. [27  A field guide to base R](./base-R.html)
#> 
#> [R for Data Science (2e)](./)
#> 
#> * [Welcome](./index.html)
#> * [Preface to the second edition](./preface-2e.html)
#> * [Introduction](./intro.html)
#> * [Whole game](./whole-game.html)
#> 
#>   + [1  Data visualization](./data-visualize.html)
#>   + [2  Workflow: basics](./workflow-basics.html)
#>   + [3  Data transformation](./data-transform.html)
#>   + [4  Workflow: code style](./workflow-style.html)
#>   + [5  Data tidying](./data-tidy.html)
#>   + [6  Workflow: scripts and projects](./workflow-scripts.html)
#>   + [7  Data import](./data-import.html)
#>   + [8  Workflow: getting help](./workflow-help.html)
#> * [Visualize](./visualize.html)
#> 
#>   + [9  Layers](./layers.html)
#>   + [10  Exploratory data analysis](./EDA.html)
#>   + [11  Communication](./communication.html)
#> * [Transform](./transform.html)
#> 
#>   + [12  Logical vectors](./logicals.html)
#>   + [13  Numbers](./numbers.html)
#>   + [14  String

read_as_markdown("https://r4ds.hadley.nz/base-R.html", canonical = TRUE) |>
  substr(1, 1000) |> cat()
#> # 27  A field guide to base R – R for Data Science (2e)
#> 
#> 1.  [Program](./program.html)
#> 2.  [27  A field guide to base R](./base-R.html)
#> 
#> [R for Data Science (2e)](./)
#> 
#>   - [Welcome](./index.html)
#> 
#>   - [Preface to the second edition](./preface-2e.html)
#> 
#>   - [Introduction](./intro.html)
#> 
#>   - [Whole game](./whole-game.html)
#>     
#>       - [1  Data visualization](./data-visualize.html)
#>       - [2  Workflow: basics](./workflow-basics.html)
#>       - [3  Data transformation](./data-transform.html)
#>       - [4  Workflow: code style](./workflow-style.html)
#>       - [5  Data tidying](./data-tidy.html)
#>       - [6  Workflow: scripts and projects](./workflow-scripts.html)
#>       - [7  Data import](./data-import.html)
#>       - [8  Workflow: getting help](./workflow-help.html)
#> 
#>   - [Visualize](./visualize.html)
#>     
#>       - [9  Layers](./layers.html)
#>       - [10  Exploratory data analysis](./EDA.html)
#>       - [11  Communication](./communication.html)
#> 
#>   - [Transform](./transform.html)
#>     
#>       - [12  Logi

# convert pdf
pdf <- file.path(R.home("doc"), "NEWS.pdf")
read_as_markdown(pdf) |> substr(1, 1000) |> cat()
#> NEWS for R version 4.5.0 (2025-04-11)
#> 
#> NEWS
#> 
#> R News
#> 
#> CHANGES IN R 4.5.0
#> 
#> NEW FEATURES:
#> 
#> (cid:136) as.integer(rl) and hence as.raw(rl) now work for a list of raw(1) elements, as
#> 
#> proposed by Michael Chirico’s PR#18696.
#> 
#> (cid:136) graphics’ grid() gains optional argument nintLog.
#> (cid:136) New functions check_package_urls() and check_package_dois() in package tools
#> 
#> for checking URLs and DOIs in package sources.
#> 
#> (cid:136) New head() and tail() methods for class "ts" time series, proposed by Spencer
#> 
#> Graves on R-devel.
#> 
#> (cid:136) New qr.influence() function, a (cid:16)bare bones(cid:17) interface to the lm.influence() leave-
#> 
#> one-out diagnostics computations; wished for in PR#18739.
#> 
#> (cid:136) Package citation() results auto-generated from the package metadata now also pro-
#> 
#> vide package DOIs for CRAN and Bioconductor packages.
#> 
#> (cid:136) New function grepv() identical to grep() except for the default value = TRUE.
#> (cid:136) methods(<pkg>:::<genfun>) now does report methods when neither 
## alternative:
# pdftools::pdf_text(pdf) |> substr(1, 2000) |> cat()

# convert images
jpg <- file.path(R.home("doc"), "html", "logo.jpg")
if (FALSE) {
  # system("brew install ffmpeg")
  reticulate::py_require("openai")
  llm_client <- reticulate::import("openai")$OpenAI()
  read_as_markdown(jpg,
    llm_client = llm_client,
    llm_model = "gpt-4o"
  )
  # # Description:
  # The image features the official logo of the R programming language.
  # Prominently displayed is a bold, blue letter "R," which serves as the
  # centerpiece of the design. Encircling the "R" is a two-toned,
  # stylized oval or ellipse with a gradient that transitions from dark
  # gray to light gray, creating a sense of motion and dynamics. R is an
  # open-source programming language widely used for statistical
  # computing, data analysis, and graphical representation. The logo
  # represents the language's focus on clarity, precision, and
  # versatility in handling complex data tasks.
}

# Alternative approach to image conversion:
if(FALSE) {
  if (Sys.getenv("OPENAI_API_KEY") != "") {
    rlang::check_installed(c("ellmer", "magick"))
    chat <- ellmer::chat_openai(echo = TRUE)
    chat$chat("Describe this image", ellmer::content_image_file(jpg))
  }
}