PDF to CSV

Getting Started

Use Case

Extract tabular data from an unstructured PDF document into CSV format.

Resources

Integration

The SDK provides two options for integrating into your project: using the Command Line Utility or programmatically.

Click here to create your free trial license key.

> Command Line

PDFix provides simple and fast automated PDF processing from the command line. The PDFix Command Line Utility is the easiest way to integrate the SDK functionality into your solution and is available for Windows, MacOS and Linux. Learn more about the Command Line Utility.

$ cd /pdfix_mac/bin
$ ./pdfix_app support@pdfix.net 3bE31NaixzFE58ir -pdf2table /Users/admin/Documents/input.pdf output_csv

Output:

PDF to TABLE
2 tables found
Success

This command extracts the tables detected in the PDF into CSV files. The output argument should point to the folder where the separate CSV files will be saved.

{ Code }

These code samples show how to extract tables from a PDF document and save them to CSV output. Code integration into your project allows you to take full control of the PDF data processing:

#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include "Pdfix.h"
// Example of how to extract tables from a PDF document and save them in CSV format.
// GetText writes the text of a single text element to the output stream as UTF-8.
// The element is not traversed; the caller decides which elements to pass in.
//   element - text element to read; must not be null
//   ofs     - stream that receives the converted text
//   eof     - when true, a line break is written after the text
void GetText(PdeText* element, std::ofstream& ofs, bool eof) {
  // The value returned for a null buffer is used as the text length.
  std::wstring text;
  text.resize(element->GetText(nullptr, 0));
  // Fill the string through a mutable pointer to its storage; writing through
  // the pointer returned by c_str() is not permitted.
  if (!text.empty())
    element->GetText(&text[0], static_cast<int>(text.size()));
  ofs << ToUtf8(text);
  if (eof)
    ofs << '\n';
}
// SaveTable searches the element tree rooted at `element` for tables.
// Each table found is written to "<save_path>/ExtractTables_<table_index>.csv"
// and table_index is incremented; any other element is searched through its children.
//   element     - root of the subtree to search; must not be null
//   save_path   - directory that receives the CSV files
//   table_index - number used for the next output file, updated in place
// Throws std::runtime_error if an output file cannot be opened.
void SaveTable(PdeElement* element, std::wstring save_path, int& table_index) {
  if (element->GetType() != kPdeTable) {
    // Not a table: keep looking among the children.
    const int child_count = element->GetNumChildren();
    for (int i = 0; i < child_count; i++) {
      PdeElement* child = element->GetChild(i);
      if (child)
        SaveTable(child, save_path, table_index);
    }
    return;
  }

  PdeTable* table = static_cast<PdeTable*>(element);
  const std::wstring path =
      save_path + L"/ExtractTables_" + std::to_wstring(table_index++) + L".csv";
  const std::string utf8_path = ToUtf8(path);

  // The stream is closed by its destructor on every exit path.
  std::ofstream ofs(utf8_path);
  if (!ofs)
    throw std::runtime_error("Cannot open output file " + utf8_path);

  const int row_count = table->GetNumRows();
  const int col_count = table->GetNumCols();
  for (int row = 0; row < row_count; row++) {
    for (int col = 0; col < col_count; col++) {
      PdeCell* cell = static_cast<PdeCell*>(table->GetCell(row, col));
      // A missing cell, a cell with a zero row or column span, or a cell
      // without children produces an empty field.
      if (cell && cell->GetRowSpan() != 0 && cell->GetColSpan() != 0) {
        const int count = cell->GetNumChildren();
        if (count > 0) {
          ofs << "\"";
          for (int i = 0; i < count; i++) {
            PdeElement* child = cell->GetChild(i);
            if (child && (child->GetType() == kPdeText))
              GetText(static_cast<PdeText*>(child), ofs, false);
            // children of one cell are separated by a single space
            if (i < count - 1)
              ofs << " ";
          }
          ofs << "\"";
        }
      }
      // A comma follows every field, including empty ones and the last one in
      // the row, so that all rows keep the same number of columns.
      ofs << ",";
    }
    ofs << '\n';
  }
}
// Extracts all tables from the document and saves them to CSV format.
const std::wstring& email, // authorization email
const std::wstring& license_key, // authorization license key
const std::wstring& open_path, // source PDF document
const std::wstring& save_path // directory where to extract images
) {
// initialize Pdfix
if (!Pdfix_init(Pdfix_MODULE_NAME))
throw std::runtime_error("Pdfix initialization fail");
Pdfix* pdfix = GetPdfix();
if (!pdfix)
throw std::runtime_error("GetPdfix fail");
if (!pdfix->Authorize(email.c_str(), license_key.c_str()))
throw std::runtime_error(std::to_string(GetPdfix()->GetErrorType()));
PdfDoc* doc = pdfix->OpenDoc(open_path.c_str(), L"");
if (!doc)
throw std::runtime_error(std::to_string(GetPdfix()->GetErrorType()));
int table_index = 1;
auto num_pages = doc->GetNumPages();
for (auto i = 0; i < num_pages; i++) {
std::cout << std::endl;
std::cout << "Processing pages..." << i + 1 << "/" << num_pages;
PdfPage* page = doc->AcquirePage(i);
if (!page)
throw std::runtime_error(std::to_string(GetPdfix()->GetErrorType()));
PdePageMap* page_map = page->AcquirePageMap(nullptr, nullptr);
if (!page_map)
throw std::runtime_error(std::to_string(GetPdfix()->GetErrorType()));
auto element = page_map->GetElement();
if (!element)
throw std::runtime_error(std::to_string(GetPdfix()->GetErrorType()));
SaveTable(element, save_path, table_index);
page->Release();
}
std::cout << std::endl << table_index - 1 << " tables found" << std::endl;
doc->Close();
pdfix->Destroy();
}
using System;
using PDFixSDK.Pdfix;
using System.IO;
namespace PDFix.App.Module
{
    // Extracts every table found in a PDF document and writes each one to its own CSV file.
    // NOTE(review): the class declaration line was missing from this listing; the
    // name ExtractTables is an assumption - confirm.
    class ExtractTables
    {
        // Number of tables written during the current Run; also the numeric
        // suffix of the next output file.
        private static int tableIndex = 0;

        // ParseText
        // Writes the text of a text element to the file, followed by a line
        // break when eof is true.
        private static void ParseText(PdeText text, StreamWriter file, bool eof)
        {
            file.Write(text.GetText());
            if (eof)
                file.Write("\n");
        }

        // ParseTable
        // Writes one table to "<outDir>/ExtractTables<tableIndex>.csv" and
        // increments tableIndex.
        private static void ParseTable(PdeTable table, String outDir)
        {
            // Path.Combine uses the directory separator of the current platform.
            string path = Path.Combine(outDir, "ExtractTables" + tableIndex++ + ".csv");

            // "using" closes the file even when writing the table throws.
            using (StreamWriter file = new StreamWriter(path))
            {
                int rowCount = table.GetNumRows();
                int colCount = table.GetNumCols();
                for (int row = 0; row < rowCount; row++)
                {
                    for (int col = 0; col < colCount; col++)
                    {
                        PdeCell cell = (PdeCell)table.GetCell(row, col);
                        // A missing cell, a cell with a zero row or column span,
                        // or a cell without children produces an empty field.
                        if (cell != null && cell.GetRowSpan() != 0 && cell.GetColSpan() != 0)
                        {
                            int count = cell.GetNumChildren();
                            if (count > 0)
                            {
                                file.Write("\"");
                                for (int i = 0; i < count; i++)
                                {
                                    PdeElement child = cell.GetChild(i);
                                    if (child != null && (child.GetType_() == PdfElementType.kPdeText))
                                    {
                                        ParseText((PdeText)child, file, false);
                                    }
                                    // children of one cell are separated by a single space
                                    if (i < count - 1)
                                    {
                                        file.Write(" ");
                                    }
                                }
                                file.Write("\"");
                            }
                        }
                        // A comma follows every field, including empty ones and the
                        // last one in the row, so all rows keep the same number of columns.
                        file.Write(",");
                    }
                    file.Write("\n");
                }
            }
        }

        // ParseElement
        // Writes the element with ParseTable when it is a table; otherwise
        // searches its children recursively.
        private static void ParseElement(PdeElement element, String outDir)
        {
            if (element.GetType_() == PdfElementType.kPdeTable)
            {
                ParseTable((PdeTable)element, outDir);
                return;
            }
            int numChilds = element.GetNumChildren();
            for (int i = 0; i < numChilds; i++)
            {
                PdeElement child = element.GetChild(i);
                if (child != null)
                    ParseElement(child, outDir);
            }
        }

        // ParsePage
        // Acquires the page map of the page, extracts all tables below its root
        // element and releases the page map again.
        private static void ParsePage(Pdfix pdfix, PdfPage page, String outDir)
        {
            // get pageMap for the current page
            PdePageMap pageMap = page.AcquirePageMap(null, IntPtr.Zero);
            if (pageMap == null)
                throw new Exception(pdfix.GetError());
            try
            {
                // get page container
                PdeElement container = pageMap.GetElement();
                if (container == null)
                    throw new Exception(pdfix.GetError());
                // parse children recursively
                ParseElement(container, outDir);
            }
            finally
            {
                pageMap.Release();
            }
        }

        // Run
        // Opens the document, extracts the tables of every page into savePath and
        // prints the number of tables written. Throws Exception with the SDK error
        // text when authorization, opening the document or acquiring a page fails.
        public static void Run(
            String email,                       // authorization email
            String licenseKey,                  // authorization license key
            String openPath,                    // source PDF document
            String savePath,                    // directory where to extract tables
            String configPath                   // configuration file (not used by this sample)
            )
        {
            Pdfix pdfix = new Pdfix();
            try
            {
                if (!pdfix.Authorize(email, licenseKey))
                    throw new Exception(pdfix.GetError());

                PdfDoc doc = pdfix.OpenDoc(openPath, "");
                if (doc == null)
                    throw new Exception(pdfix.GetError());
                try
                {
                    // start counting from zero so the reported number covers this run only
                    tableIndex = 0;

                    // iterate through pages and parse each page individually
                    int numPages = doc.GetNumPages();
                    for (int i = 0; i < numPages; i++)
                    {
                        PdfPage page = doc.AcquirePage(i);
                        if (page == null)
                            throw new Exception(pdfix.GetError());
                        try
                        {
                            ParsePage(pdfix, page, savePath);
                        }
                        finally
                        {
                            page.Release();
                        }
                    }
                    Console.WriteLine(tableIndex + " tables detected");
                }
                finally
                {
                    doc.Close();
                }
            }
            finally
            {
                pdfix.Destroy();
            }
        }
    }
}
package net.pdfix.samples;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import net.pdfix.Utils;
import net.pdfix.pdfixlib.*;
/**
 * Extracts every table found in a PDF document and writes each one to its own
 * CSV file.
 */
public class ExtractTables {
    // Number of tables written during the current run; also the numeric suffix
    // of the next output file.
    private static int tableIndex = 0;

    /**
     * Appends the text of an element to the buffer. A text element contributes
     * its own text (followed by a line break when eof is true); any other
     * element is searched through its children.
     *
     * @param element element to read; must not be null
     * @param ss      buffer that receives the text
     * @param eof     when true, a line break is appended after each text element
     */
    private static void GetText(PdeElement element, StringBuilder ss,
            boolean eof) {
        PdfElementType elemType = element.GetType();
        if (PdfElementType.kPdeText == elemType) {
            PdeText textElem = (PdeText) element;
            ss.append(textElem.GetText());
            if (eof)
                ss.append("\n");
        }
        else {
            int count = element.GetNumChildren();
            for (int i = 0; i < count; i++) {
                PdeElement child = element.GetChild(i);
                if (child != null)
                    GetText(child, ss, eof);
            }
        }
    }

    /**
     * Searches the element tree rooted at element for tables. Each table found
     * is written to "savePath/ExtractTables_N.csv", where N is the current
     * value of tableIndex, which is then incremented.
     *
     * @param element  root of the subtree to search; must not be null
     * @param savePath directory that receives the CSV files
     * @throws Exception if an output file cannot be written
     */
    private static void SaveTable(
            PdeElement element,
            String savePath
    ) throws Exception {
        PdfElementType elemType = element.GetType();
        if (elemType != PdfElementType.kPdeTable) {
            // Not a table: keep looking among the children.
            int count = element.GetNumChildren();
            for (int i = 0; i < count; i++) {
                PdeElement child = element.GetChild(i);
                if (child != null)
                    SaveTable(child, savePath);
            }
            return;
        }

        PdeTable table = (PdeTable) element;
        StringBuilder ofs = new StringBuilder();
        String path = savePath + "/ExtractTables_" + Integer.toString(tableIndex++) + ".csv";

        int rowCount = table.GetNumRows();
        int colCount = table.GetNumCols();
        for (int row = 0; row < rowCount; row++) {
            for (int col = 0; col < colCount; col++) {
                PdeCell cell = (PdeCell) table.GetCell(row, col);
                // A missing cell, a cell with a zero row or column span, or a
                // cell without children produces an empty field.
                if (cell != null && cell.GetRowSpan() != 0 && cell.GetColSpan() != 0) {
                    int count = cell.GetNumChildren();
                    if (count > 0) {
                        ofs.append("\"");
                        for (int i = 0; i < count; i++) {
                            PdeElement child = cell.GetChild(i);
                            if (child != null && child.GetType() == PdfElementType.kPdeText) {
                                GetText(child, ofs, false);
                            }
                            // children of one cell are separated by a single space
                            if (i < count - 1)
                                ofs.append(" ");
                        }
                        ofs.append("\"");
                    }
                }
                // A comma follows every field, including empty ones and the last
                // one in the row, so all rows keep the same number of columns.
                ofs.append(",");
            }
            ofs.append("\n");
        }

        File file = new File(path);
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(file))) {
            writer.write(ofs.toString());
        }
    }

    /**
     * Opens the document and extracts the tables of every page into savePath.
     *
     * @param email      authorization email
     * @param licenseKey license key
     * @param openPath   source PDF document
     * @param savePath   directory where to extract tables
     * @param configPath configuration file (not used by this sample)
     * @throws Exception with the SDK error text when authorization, opening the
     *                   document, or acquiring a page, page map or root element
     *                   fails, or when an output file cannot be written
     */
    public static void run(
            String email,                       // authorization email
            String licenseKey,                  // license key
            String openPath,                    // source PDF document
            String savePath,                    // directory where to extract tables
            String configPath
    ) throws Exception {
        System.load(Utils.getAbsolutePath(Utils.getModuleName("pdfix")));

        Pdfix pdfix = new Pdfix();
        try {
            if (!pdfix.Authorize(email, licenseKey))
                throw new Exception(pdfix.GetError());

            PdfDoc doc = pdfix.OpenDoc(openPath, "");
            if (doc == null)
                throw new Exception(pdfix.GetError());
            try {
                // file numbering starts from zero for every run
                tableIndex = 0;

                int numPages = doc.GetNumPages();
                for (int i = 0; i < numPages; i++) {
                    PdfPage page = doc.AcquirePage(i);
                    if (page == null)
                        throw new Exception(pdfix.GetError());
                    try {
                        PdePageMap pageMap = page.AcquirePageMap();
                        if (pageMap == null)
                            throw new Exception(pdfix.GetError());
                        try {
                            PdeElement element = pageMap.GetElement();
                            if (element == null)
                                throw new Exception(pdfix.GetError());
                            SaveTable(element, savePath);
                        }
                        finally {
                            // NOTE(review): assumes the Java binding exposes
                            // PdePageMap.Release() - confirm.
                            pageMap.Release();
                        }
                    }
                    finally {
                        page.Release();
                    }
                }
            }
            finally {
                doc.Close();
            }
        }
        finally {
            pdfix.Destroy();
        }
    }
}

Result

Found tables extracted to separate CSV files as displayed below:

RAW CSV data:

"Year","England/","Holland/","Italy","Spain",
,"Great Britain","The Netherlands",,,
"1086","754",,,,
"1270","759",,,"957",
"1300","755",,"1,482","957",
"1348","777","876","1,376","1,030",
"1400","1,090","1,245","1,601","885",
"1450","1,055","1,432","1,668","889",
"1500","1,114","1,483","1,403","889",
"1570","1,143","1,783","1,337","990",
"1600","1,123","2,372","1,244","944",
"1650","1,100","2,171","1,271","820",
"1700","1,630","2,403","1,350","880",
,"1,563",,,,
"1750","1,710","2,440","1,403","910",
"1800","2,080","2,617","1,244","962",
,,"1,752",,,
"1820","2,133","1,953","1,376","1,087",
"1850","2,997","2,397","1,350","1,144",

Data Imported to Excel:

Customizing the output

PDFix SDK allows customization of the output by using configuration files that affect the table detection process and the output structure. To learn more about the configuration files, please follow the Documentation. When using the SDK programmatically, there are no limits to fitting the output to your needs.

Contact us if you need help with integration.

Windows, MacOS, Linux

Java, Python, C#, C++