# Time-stamp: <2019-03-15 09:38:14 chl>
#
# Build a word cloud (PNG) from the Markdown files found in `cwd`.
# Each file is read as whitespace-delimited tokens, cleaned by the strip.*
# helpers below, and the resulting strings are handed to wordcloud().

library(wordcloud)

cwd <- "~/Sites/aliquote/content/micro"

# Anchor the pattern so only names ending in ".md" are picked up (an
# unanchored "\\.md" would also match e.g. "post.md.bak").
# The first listed file is dropped -- presumably an index page; TODO confirm.
f <- list.files(path = cwd, pattern = "\\.md$", full.names = TRUE)[-1]

# Words removed outright by strip.char().
skip <- c("h/t", "via", "nbsp", "just", "like")

# Each helper takes a character vector `s` and returns it with one kind of
# markup or noise removed.
#
# The three bracket strippers use negated character classes rather than
# ".*": reader() collapses a whole file into one string, so a greedy ".*"
# would delete everything between the first opening delimiter and the last
# closing one.

# Remove backtick-quoted hashtags such as `#foo`.
strip.htag <- function(s) gsub("`\\#[a-z]+`", "", s)

# Remove parenthesised spans, e.g. "(...)".
strip.html <- function(s) gsub("\\([^)]*\\)", "", s)

# Remove angle-bracketed spans, e.g. "<...>".
strip.raw.html <- function(s) gsub("<[^>]*>", "", s)

# Remove brace-delimited spans, e.g. "{{ ... }}".
strip.hugo <- function(s) gsub("\\{+[^}]*\\}+", "", s)

# first enclosing has already been removed
# NOTE(review): as written this deletes every run of two spaces, which glues
# together the words on either side of anything removed by the helpers
# above -- confirm this is the intended pattern.
strip.itunes <- function(s) gsub("  ", "", s)

# Remove the `skip` words. Word boundaries keep the match from eating the
# inside of longer words ("trivial" used to become "tril").
strip.char <- function(s) {
  gsub(paste0("\\b(", paste(skip, collapse = "|"), ")\\b"), "", s)
}

# Remove words of one to four characters.
strip.short <- function(s) gsub("\\b\\w{1,4}\\b", "", s)

# Replace punctuation with a space.
strip.punct <- function(s) gsub("[[:punct:]]", " ", s)

# Read one file and return its cleaned text as a single string.
#
# x: path of the file to read.
#
# Returns a length-one character vector with runs of whitespace squeezed to
# a single space.
reader <- function(x) {
  # Drop the first ten whitespace-delimited tokens -- presumably the front
  # matter; TODO confirm against the actual files.
  tokens <- scan(x, what = "character")[-c(1:10)]
  txt <- paste(tokens, collapse = " ")
  txt <- strip.htag(txt)
  txt <- strip.html(txt)
  txt <- strip.raw.html(txt)
  txt <- strip.hugo(txt)
  txt <- strip.itunes(txt)
  txt <- strip.char(txt)
  txt <- strip.short(txt)
  ## probably useless since wordcloud relies on tm
  txt <- strip.punct(txt)
  gsub("\\s+", " ", txt)
}

l <- lapply(f, reader)

png("~/Desktop/wc-micro.png", width = 600, height = 600)
# Close the device even when wordcloud() fails, so no half-written PNG
# device is left open in the session.
invisible(tryCatch(
  wordcloud(unlist(l)),
  finally = dev.off()
))