Difference between revisions of "R: read multi PDF ke tidytext"

From OnnoWiki
Jump to navigation Jump to search
(Created page with " library(tidyverse) library(tidytext) library(tm) directory <- "data-pdf" # create corpus from pdfs converted <- VCorpus(DirSource(directory), readerControl = list(rea...")
 
Line 1: Line 1:
  
 +
==dengan tidytext dan tm==
  
 
  library(tidyverse)
 
  library(tidyverse)
Line 14: Line 15:
 
   filter(!grepl("[0-9]+", term))
 
   filter(!grepl("[0-9]+", term))
  
 +
==dengan tidytext tanpa tm==
 +
 +
directory <- "data-pdf"
 +
 +
pdfs <- paste(directory, "/", list.files(directory, pattern = "*.pdf"), sep = "")
 +
pdf_names <- list.files(directory, pattern = "*.pdf")
 +
pdfs_text <- map(pdfs, pdftools::pdf_text)
 +
 +
my_data <- data_frame(document = pdf_names, text = pdfs_text)
 +
 +
my_data %>%
 +
  unnest %>% # pdfs_text is a list
 +
  unnest_tokens(word, text, strip_numeric = TRUE) %>%  # removing all numbers
 +
  group_by(document, word) %>%
 +
  summarise(count = n())
  
  

Revision as of 11:30, 6 November 2018

dengan tidytext dan tm

library(tidyverse)
library(tidytext)
library(tm)
# Folder that holds the PDF files to analyse.
directory <- "data-pdf"

# Read the folder into a volatile tm corpus using the PDF reader, then
# collapse it straight into a document-term matrix.
converted <- DocumentTermMatrix(
  VCorpus(
    DirSource(directory),
    readerControl = list(reader = readPDF)
  )
)

# Turn the matrix into a tidy table and discard every term containing a digit.
filter(tidy(converted), !grepl("[0-9]+", term))

dengan tidytext tanpa tm

# Count words per PDF using pdftools + tidytext only (no tm required).
# Packages are loaded here so this section can be run on its own.
library(tidyverse)
library(tidytext)

# Folder that holds the PDF files to analyse.
directory <- "data-pdf"

# list.files() expects a regular expression, not a shell glob: "\\.pdf$"
# matches only names ending in ".pdf". Listing once with full paths and
# deriving the names from them keeps `pdfs` and `pdf_names` aligned.
pdfs <- list.files(directory, pattern = "\\.pdf$", full.names = TRUE)
pdf_names <- basename(pdfs)

# One list element per file, as returned by pdftools::pdf_text().
pdfs_text <- map(pdfs, pdftools::pdf_text)

# `text` is a list-column at this point.
my_data <- tibble(document = pdf_names, text = pdfs_text)

my_data %>%
  unnest(text) %>% # flatten the list-column; name it explicitly
  unnest_tokens(word, text, strip_numeric = TRUE) %>% # removing all numbers
  count(document, word, name = "count") # ungrouped per-document word counts


Pranala Menarik