Difference between revisions of "R: tidytext RPJP BAPPENAS"

From OnnoWiki
Jump to navigation Jump to search
(Created page with " install.packages("pdftools") library(pdftools) rpjp2005 <- pdf_text("RPJP_2005-2025.pdf") %>% strsplit(split = "\n") original_rpjp2005 <- rpjp2005 %>% group_by(book...")
 
Line 1: Line 1:
  
 +
library(tidyverse)
 +
library(tidytext)
 +
library(tm)
 +
directory <- "data-pdf"
 +
 +
# create corpus from pdfs
 +
converted <- VCorpus(DirSource(directory), readerControl = list(reader = readPDF)) %>%
 +
  DocumentTermMatrix()
 +
 +
converted %>%
 +
  tidy() %>%
 +
  filter(!grepl("[0-9]+", term))
  
# Install pdftools only when it is missing, so re-running the script does
# not trigger a fresh installation every time.
if (!requireNamespace("pdftools", quietly = TRUE)) {
  install.packages("pdftools")
}

library(pdftools)
# Provides %>%, tibble(), group_by(), mutate() and str_detect() used below.
library(tidyverse)

# The output of pdf_text() is split on "\n", so `rpjp2005` is a list of
# character vectors (one vector of lines per element of the extracted text).
rpjp2005 <- pdf_text("RPJP_2005-2025.pdf") %>% strsplit(split = "\n")

# The pipeline needs a data frame with `book` and `text` columns, but
# `rpjp2005` is a plain list. Flatten it into a one-line-per-row tibble first;
# as.character() keeps `text` a character column even when nothing was
# extracted.
# NOTE(review): the heading pattern still looks for the English word
# "chapter" -- confirm which heading word the PDF actually uses.
original_rpjp2005 <- tibble(
  book = "RPJP 2005-2025",
  text = as.character(unlist(rpjp2005, use.names = FALSE))
) %>%
  group_by(book) %>%
  mutate(
    # Position of the line within its book.
    linenumber = row_number(),
    # Running count of lines that match the chapter-heading pattern.
    chapter = cumsum(str_detect(
      text,
      regex("^chapter [\\divxlc]", ignore_case = TRUE)
    ))
  ) %>%
  ungroup()

# Print the table that was just built.
original_rpjp2005
 
  
  

Revision as of 11:36, 6 November 2018

library(tidyverse)
library(tidytext)
library(tm)
# Folder that holds the PDF files to analyse.
directory <- "data-pdf"

# Build a corpus from the files under `directory` using tm's PDF reader,
# then turn that corpus into a document-term matrix.
converted <- DocumentTermMatrix(
  VCorpus(
    DirSource(directory),
    readerControl = list(reader = readPDF)
  )
)

# Convert the matrix with tidy() and drop every term that contains a digit.
# The result is printed, not stored.
filter(
  tidy(converted),
  !grepl("[0-9]+", term)
)


Pranala Menarik