Difference between revisions of "R: read multi PDF ke tidytext"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) (Created page with " library(tidyverse) library(tidytext) library(tm) directory <- "data-pdf" # create corpus from pdfs converted <- VCorpus(DirSource(directory), readerControl = list(rea...") |
Onnowpurbo (talk | contribs) |
||
| (One intermediate revision by the same user not shown) | |||
| Line 1: | Line 1: | ||
| + | |||
| + | ==baca banyak PDF== | ||
| + | |||
| + | library(pdftools) | ||
| + | library(tidyverse) | ||
| + | library(tidytext) | ||
| + | |||
| + | map_df(all_pdfs, ~ data_frame(txt = pdf_text(.x)) %>% | ||
| + | mutate(filename = .x) %>% | ||
| + | unnest_tokens(word, txt)) | ||
| + | |||
| + | |||
| + | # list PDF file yang ada | ||
| + | all_pdfs <- list.files(pattern = ".pdf$") | ||
| + | |||
| + | |||
| + | ==dengan tidytext dan tm== | ||
library(tidyverse) | library(tidyverse) | ||
| Line 14: | Line 31: | ||
filter(!grepl("[0-9]+", term)) | filter(!grepl("[0-9]+", term)) | ||
| + | ==dengan tidytext tanpa tm== | ||
| + | |||
| + | directory <- "data-pdf" | ||
| + | |||
| + | pdfs <- paste(directory, "/", list.files(directory, pattern = "*.pdf"), sep = "") | ||
| + | pdf_names <- list.files(directory, pattern = "*.pdf") | ||
| + | pdfs_text <- map(pdfs, pdftools::pdf_text) | ||
| + | |||
| + | my_data <- data_frame(document = pdf_names, text = pdfs_text) | ||
| + | |||
| + | my_data %>% | ||
| + | unnest %>% # pdfs_text is a list | ||
| + | unnest_tokens(word, text, strip_numeric = TRUE) %>% # removing all numbers | ||
| + | group_by(document, word) %>% | ||
| + | summarise(count = n()) | ||
Latest revision as of 10:06, 3 December 2019
baca banyak PDF
library(pdftools)
library(tidyverse)
library(tidytext)
# List the PDF files in the current working directory.
# This has to run before the files are read, because map_df() below uses
# all_pdfs. The dot is escaped so only a literal ".pdf" suffix matches.
all_pdfs <- list.files(pattern = "\\.pdf$")

# Read each PDF, record which file the text came from, and split the text
# into one word per row. map_df() row-binds the per-file results.
map_df(all_pdfs, ~ tibble(txt = pdf_text(.x)) %>%
  mutate(filename = .x) %>%
  unnest_tokens(word, txt))
dengan tidytext dan tm
library(tidyverse)
library(tidytext)
library(tm)

# Folder that holds the PDF files to read.
directory <- "data-pdf"

# Create a corpus from the PDFs in `directory` and turn it into a
# document-term matrix. The assignment must sit on its own line: if it
# shares a line with a comment it is never executed.
converted <- VCorpus(
  DirSource(directory),
  readerControl = list(reader = readPDF)
) %>%
  DocumentTermMatrix()

# Convert the matrix to a tidy data frame and drop every term that
# contains at least one digit.
converted %>%
  tidy() %>%
  filter(!grepl("[0-9]+", term))
dengan tidytext tanpa tm
# Folder that holds the PDF files to read.
directory <- "data-pdf"

# Full paths of the PDFs. `pattern` is a regular expression (not a glob),
# so match a literal ".pdf" suffix.
pdfs <- list.files(directory, pattern = "\\.pdf$", full.names = TRUE)
# File names without the directory, derived from `pdfs` so that both
# vectors are guaranteed to line up element by element.
pdf_names <- basename(pdfs)
# One list element per file, holding the text extracted from that file.
pdfs_text <- map(pdfs, pdftools::pdf_text)

my_data <- tibble(document = pdf_names, text = pdfs_text)

# Count how often each word occurs in each document.
my_data %>%
  unnest(text) %>% # text is a list column
  unnest_tokens(word, text, strip_numeric = TRUE) %>% # removing all numbers
  group_by(document, word) %>%
  summarise(count = n()) %>%
  ungroup() # do not leave the result grouped by document