R: read multi PDF ke tidytext

From OnnoWiki
Revision as of 10:06, 3 December 2019 by Onnowpurbo (talk | contribs)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search


baca banyak PDF

library(pdftools)
library(tidyverse)
library(tidytext)

# List the PDF files in the working directory. This has to run before the
# files are read below. list.files() takes a regular expression, so the dot
# is escaped and the pattern anchored to match only names ending in ".pdf".
all_pdfs <- list.files(pattern = "\\.pdf$")

# For every PDF: extract its text, record which file each row came from,
# and split the text into one word per row. The per-file results are
# row-bound into a single data frame with columns `filename` and `word`.
map_df(all_pdfs, ~ tibble(txt = pdf_text(.x)) %>%
    mutate(filename = .x) %>%
    unnest_tokens(word, txt))


dengan tidytext dan tm

library(tidyverse)
library(tidytext)
library(tm)
# Folder that holds the PDF files to analyse.
directory <- "data-pdf"

# Read every file in the folder into a corpus using tm's PDF reader, then
# build the document-term matrix from that corpus.
pdf_corpus <- VCorpus(
  DirSource(directory),
  readerControl = list(reader = readPDF)
)
converted <- DocumentTermMatrix(pdf_corpus)

# Convert the matrix to tidy form and keep only the terms that contain
# no digits.
filter(tidy(converted), !grepl("[0-9]+", term))

dengan tidytext tanpa tm

# Load the packages this snippet relies on so it can be run on its own.
library(tidyverse)
library(tidytext)

# Folder that holds the PDF files to analyse.
directory <- "data-pdf"

# Full paths of the PDFs. list.files() takes a regular expression (not a
# glob), so the dot is escaped and the pattern anchored to the end of the name.
pdfs <- list.files(directory, pattern = "\\.pdf$", full.names = TRUE)
# Bare file names, derived from the same listing so they stay aligned with
# `pdfs`.
pdf_names <- basename(pdfs)
# Extract the text of each file; the result is a list with one element per PDF.
pdfs_text <- map(pdfs, pdftools::pdf_text)

# One row per document; `text` is a list-column holding the extracted text.
my_data <- tibble(document = pdf_names, text = pdfs_text)

my_data %>%
  unnest(cols = text) %>%  # expand the list-column into regular rows
  unnest_tokens(word, text, strip_numeric = TRUE) %>%  # removing all numbers
  count(document, word, name = "count")  # occurrences of each word per document


Pranala Menarik