Difference between revisions of "Python: PDF"

From OnnoWiki
Jump to navigation Jump to search
Line 15: Line 15:
 
  # print the number of pages in pdf file
 
  # print the number of pages in pdf file
 
  print(fileReader.numPages)
 
  print(fileReader.numPages)
 +
 +
==textract==
 +
 +
 +
pip install textract
 +
 +
# for read pdf
 +
import textract
 +
text = textract.process('path/to/pdf/file', method='pdfminer')
 +
  
 
==Referensi==
 
==Referensi==
  
 
* http://pythonhosted.org/PyPDF2/
 
* http://pythonhosted.org/PyPDF2/
 +
* http://textract.readthedocs.io/en/stable/index.html

Revision as of 05:26, 25 October 2018

PyPDF2

# install PyPDF2
pip install PyPDF2

# Import the PDF library
import PyPDF2

# Open the PDF in binary mode; the with-block guarantees the file handle
# is closed again, even if reading the PDF raises an error.
with open('example.pdf', 'rb') as file:
    # Create a PDF reader object on top of the open file.
    # NOTE: PyPDF2 >= 3.0 removed PdfFileReader/numPages; on those versions
    # the equivalents are PyPDF2.PdfReader(file) and len(reader.pages).
    fileReader = PyPDF2.PdfFileReader(file)

    # Print the number of pages in the PDF file
    print(fileReader.numPages)

textract

pip install textract
# Extract the text content of a PDF, asking textract to use its
# 'pdfminer' extraction method.
import textract

text = textract.process(
    'path/to/pdf/file',
    method='pdfminer',
)


Referensi