As is well known, Python is a general-purpose language, sometimes described as utilitarian, that is designed to be simple to read and write. The rise of Big Data and Cloud Computing in the enterprise world has also helped Python become successful and widely used.

PDFix, Python and Big Data

When we talk about Big Data, AI, or Machine Learning in connection with the PDFix SDK and its ability to scrape various data from PDF files, Python support is a must.

Check out the Code Samples

Check out the first samples of the native PDF API in Python:

PDFix Initialization Sample:

from Pdfix import *
def Initialization(email, key):
    """Acquire PDFix, print the SDK version and authorize the license.

    Args:
        email: Account e-mail passed to ``pdfix.Authorize``.
        key: License key passed to ``pdfix.Authorize``.

    Raises:
        Exception: If ``GetPdfix()`` returns ``None`` or if
            ``pdfix.Authorize`` reports failure.
    """
    print('Pdfix Initialization Sample')

    pdfix = GetPdfix()
    if pdfix is None:
        raise Exception('Pdfix Initialization fail')

    try:
        # check version
        major = pdfix.GetVersionMajor()
        minor = pdfix.GetVersionMinor()
        patch = pdfix.GetVersionPatch()
        print("PDFix SDK Version " + str(major) + "." + str(minor) + "." + str(patch))

        # authorization: use the credentials passed in by the caller rather
        # than relying on module-level globals of the calling script
        if not pdfix.Authorize(email, key):
            raise Exception('Authorization fail : ' + pdfix.GetError())

        # some code to execute
    finally:
        # cleanup, also performed when authorization fails
        pdfix.Destroy()
# Sample driver: initialize PDFix from pdfix_dll_path, run the
# Initialization sample, then call Pdfix_destroy().
try:
    Pdfix_init(pdfix_dll_path)  # pdfix initialization
    Initialization(pdfix_email, pdfix_license)
    Pdfix_destroy()
except Exception as error:
    # Any failure raised above is reported as a single line of output.
    print('Oops! ' + str(error))

Convert PDF to HTML:

from Pdfix import *
from PdfToHtml import *
def ConvertToHtml(email, key,
                  open_path,     # PDF document to open
                  save_path,     # output html document
                  config_path,   # path to configuration file
                  html_params):  # PdfHtmlParams structure
    """Convert the PDF at ``open_path`` to HTML saved at ``save_path``.

    Args:
        email: Account e-mail passed to ``pdfix.Authorize``.
        key: License key passed to ``pdfix.Authorize``.
        open_path: PDF document to open.
        save_path: Output HTML document.
        config_path: Path to configuration file. Accepted for interface
            compatibility; this sample does not read it.
        html_params: ``PdfHtmlParams`` structure handed to ``htmlDoc.Save``.

    Raises:
        Exception: If PDFix or PdfToHtml cannot be obtained or initialized,
            authorization fails, the PDF or HTML document cannot be opened,
            or saving the HTML output fails.
    """
    pdfix = GetPdfix()
    if pdfix is None:
        raise Exception('Pdfix Initialization fail')

    try:
        # authorization: use the credentials passed in by the caller
        if not pdfix.Authorize(email, key):
            raise Exception('Authorization fail : ' + pdfix.GetError())

        pdfToHtml = GetPdfToHtml()
        if pdfToHtml is None:
            raise Exception('PdfToHtml Initialization fail')

        if not pdfToHtml.Initialize(pdfix):
            raise Exception('PdfToHtml Initialize Pdfix fail')

        doc = pdfix.OpenDoc(open_path, "")
        if doc is None:
            raise Exception('Unable to open doc : ' + pdfix.GetError())

        try:
            htmlDoc = pdfToHtml.OpenHtmlDoc(doc)
            if htmlDoc is None:
                raise Exception('Unable to open html doc : ' + pdfix.GetError())

            # convert all pages at once
            if not htmlDoc.Save(save_path, html_params, 0, None):
                # distinct message so a save failure is not mistaken for an
                # open failure
                raise Exception('Unable to save html doc : ' + pdfix.GetError())
        finally:
            # close the document even when conversion fails
            doc.Close()
    finally:
        pdfix.Destroy()
# Sample driver: initialize PDFix and PdfToHtml, convert the bundled test
# PDF to HTML, then destroy both libraries.
try:
    # pdfix initialization
    Pdfix_init(pdfix_dll_path)
    PdfToHtml_init(pdf_to_html_dll_path)

    # conversion parameters: request the kPdfHtmlFixed type
    htmlParams = PdfHtmlParams()
    htmlParams.type = kPdfHtmlFixed

    ConvertToHtml(
        pdfix_email,
        pdfix_license,
        getAbsolutePath(pdfix_bin + './resources/test.pdf'),
        getAbsolutePath(pdfix_bin + './output/index.html'),
        getAbsolutePath(pdfix_bin + './resources/config.json'),
        htmlParams,
    )

    PdfToHtml_destroy()
    Pdfix_destroy()
except Exception as error:
    # Any failure raised above is reported as a single line of output.
    print('Oops! ' + str(error))

Extract Text from PDF:

from Pdfix import *
def GetText(element, output):
    """Write the text found under ``element`` to ``output``.

    A ``kPdeText`` element has its text written followed by a newline.
    Any other element is walked recursively through its children, skipping
    children that come back as ``None``.

    Args:
        element: Page element exposing ``GetType``, ``GetNumChildren``,
            ``GetChild`` and ``obj``.
        output: Writable text stream (anything with ``write``).
    """
    kind = element.GetType()

    # Leaf case: a text element is written out directly.
    if kPdeText == kind:
        text_element = PdeText(element.obj)
        output.write(text_element.GetText())
        output.write("\n")
        return

    # Container case: descend into each child; an empty container simply
    # yields no iterations.
    for index in range(element.GetNumChildren()):
        child = element.GetChild(index)
        if child is None:
            continue
        GetText(child, output)
def ExtractText(email, key,
                open_path,     # PDF document to open
                save_path,     # output txt document
                config_path):  # path to configuration file
    """Extract the text of every page of a PDF into a text file.

    Args:
        email: Account e-mail passed to ``pdfix.Authorize``.
        key: License key passed to ``pdfix.Authorize``.
        open_path: PDF document to open.
        save_path: Output text document.
        config_path: Path to configuration file. Accepted for interface
            compatibility; this sample does not read it.

    Raises:
        Exception: If PDFix cannot be obtained, authorization fails, the
            document cannot be opened, or a page, its page map or its root
            element cannot be acquired.
    """
    pdfix = GetPdfix()
    if pdfix is None:
        raise Exception('Pdfix Initialization fail')

    try:
        # authorization: use the credentials passed in by the caller
        if not pdfix.Authorize(email, key):
            raise Exception('Authorization fail : ' + pdfix.GetError())

        doc = pdfix.OpenDoc(open_path, "")
        if doc is None:
            raise Exception('Unable to open doc : ' + pdfix.GetError())

        try:
            # prepare the output file; ``with`` closes it even when a page
            # fails, and an explicit UTF-8 encoding keeps non-ASCII PDF text
            # from failing under a narrower platform default encoding
            with open(save_path, "w", encoding="utf-8") as output:
                for page_index in range(doc.GetNumPages()):
                    # acquire page
                    page = doc.AcquirePage(page_index)
                    if page is None:
                        raise Exception('Acquire Page fail : ' + pdfix.GetError())

                    # get the page map of the current page
                    pageMap = page.AcquirePageMap(0, None)
                    if pageMap is None:
                        raise Exception('Acquire PageMap fail : ' + pdfix.GetError())

                    # get page container
                    container = pageMap.GetElement()
                    if container is None:
                        raise Exception('Get page element failure : ' + pdfix.GetError())

                    GetText(container, output)
                    # NOTE(review): acquired pages / page maps are not
                    # released here - confirm against the SDK whether an
                    # explicit release call is required per page.
        finally:
            # close the document even when extraction fails
            doc.Close()
    finally:
        pdfix.Destroy()
# Sample driver: initialize PDFix, extract the text of the bundled test PDF
# into ExtractText.txt, then call Pdfix_destroy().
try:
    Pdfix_init(pdfix_dll_path)  # pdfix initialization

    ExtractText(
        pdfix_email,
        pdfix_license,
        getAbsolutePath(pdfix_bin + './resources/test.pdf'),
        getAbsolutePath(pdfix_bin + './output/ExtractText.txt'),
        getAbsolutePath(pdfix_bin + './resources/config.json'),
    )

    Pdfix_destroy()
except Exception as error:
    # Any failure raised above is reported as a single line of output.
    print('Oops! ' + str(error))

Add Comment:

from Pdfix import *
def AddComment(email, key, open_path, save_path):
    """Add a text annotation with a reply to the first page and save the PDF.

    The annotation rectangle is a 20 x 20 box centred on the page crop box.

    Args:
        email: Account e-mail passed to ``pdfix.Authorize``.
        key: License key passed to ``pdfix.Authorize``.
        open_path: PDF document to open.
        save_path: Path the modified document is saved to.

    Raises:
        Exception: If PDFix cannot be obtained, authorization fails, the
            document or its first page cannot be opened, the annotation
            cannot be created, or saving fails.
    """
    pdfix = GetPdfix()
    if pdfix is None:
        raise Exception('Pdfix Initialization fail')

    try:
        # authorization: use the credentials passed in by the caller
        if not pdfix.Authorize(email, key):
            raise Exception('Authorization fail : ' + pdfix.GetError())

        doc = pdfix.OpenDoc(open_path, "")
        if doc is None:
            raise Exception('Unable to open pdf : ' + pdfix.GetError())

        try:
            page = doc.AcquirePage(0)
            if page is None:
                raise Exception('Unable to acquire page : ' + pdfix.GetError())

            try:
                cropBox = page.GetCropBox()

                # place annotation to the middle of the page
                center_x = (cropBox.right + cropBox.left) / 2.0
                center_y = (cropBox.top + cropBox.bottom) / 2.0
                annotRect = PdfRect()
                annotRect.left = center_x - 10
                annotRect.bottom = center_y - 10
                annotRect.right = center_x + 10
                annotRect.top = center_y + 10

                annot = page.AddTextAnnot(-1, annotRect)
                if annot is None:
                    raise Exception(pdfix.GetError())

                annot.SetAuthor("Peter Brown")
                annot.SetContents("This is my comment.")
                annot.AddReply("Mark Fish", "This is some reply.")
            finally:
                # release the page even when the annotation cannot be added
                doc.ReleasePage(page)

            if not doc.Save(save_path, kSaveFull):
                raise Exception(pdfix.GetError())
        finally:
            # close the document even when annotating or saving fails
            doc.Close()
    finally:
        pdfix.Destroy()
# Sample driver: initialize PDFix, add a comment to the bundled test PDF and
# save it as AddComment.pdf, then call Pdfix_destroy().
try:
    Pdfix_init(pdfix_dll_path)  # pdfix initialization

    AddComment(
        pdfix_email,
        pdfix_license,
        getAbsolutePath(pdfix_bin + './resources/test.pdf'),
        getAbsolutePath(pdfix_bin + './output/AddComment.pdf'),
    )

    Pdfix_destroy()
except Exception as error:
    # Any failure raised above is reported as a single line of output.
    print('Oops! ' + str(error))

Need more samples now or an advanced solution?

We are always happy to adapt our solutions to meet customers’ needs and expectations. Send us your feedback so that we can meet your requirements!

More PDFix Samples are here >>

Download the PDFix SDK to use in Python here >>