Add Tags to PDF

Getting Started

Use Case

Add Tags to PDF to improve text and graphics extraction.

Resources

Integration

The SDK provides two options for integrating into your project: using the Command Line Utility or programmatically.

Click here to create your free trial license key.

> Command Line

PDFix provides simple and fast automated PDF processing using a command line. The PDFix Command Line Utility is the easiest way to integrate the SDK functionality into your solution, and it is available for Windows, macOS and Linux. Learn more about the Command Line Utility.

$ cd /pdfix_mac/bin
$ ./pdfix_app support@pdfix.net 3bE31NaixzFE58ir -addtags /Users/admin/Documents/input.pdf output_csv

Output:

Add Tags to PDF
Success

This command adds tags to PDF. No options are currently available for CLI.

{ Code }

These code samples show how to add tags to a PDF document. Code integration into your project allows you to take full control of the PDF data processing:

#include <iostream>
#include <stdexcept>
#include <string>
#include "Pdfix.h"
/// Auto-tags a PDF document and saves the tagged result.
///
/// Initializes PDFix, authorizes it, opens `open_path`, optionally loads a
/// JSON document template from `config_path`, runs AddTags on the document and
/// saves it to `save_path` with kSaveFull.
///
/// @param email        authorization email
/// @param license_key  authorization license key
/// @param open_path    source PDF document
/// @param save_path    output PDF document
/// @param config_path  configuration file; when empty, no template is loaded
/// @throws std::runtime_error if any step fails. Except for the two
///         initialization failures, the message is the numeric value returned
///         by GetErrorType().
void AddTags(
    const std::wstring& email,        // authorization email
    const std::wstring& license_key,  // authorization license key
    const std::wstring& open_path,    // source PDF document
    const std::wstring& save_path,    // output PDF document
    const std::wstring& config_path   // configuration file
) {
  // initialize Pdfix
  if (!Pdfix_init(Pdfix_MODULE_NAME))
    throw std::runtime_error("Pdfix initialization fail");

  Pdfix* pdfix = GetPdfix();
  if (!pdfix)
    throw std::runtime_error("GetPdfix fail");

  // Tracked outside the try block so the handler below can release whatever
  // had been acquired when a step failed.
  PdfDoc* doc = nullptr;
  PsFileStream* stm = nullptr;

  try {
    if (!pdfix->Authorize(email.c_str(), license_key.c_str()))
      throw std::runtime_error(std::to_string(pdfix->GetErrorType()));

    doc = pdfix->OpenDoc(open_path.c_str(), L"");
    if (!doc)
      throw std::runtime_error(std::to_string(pdfix->GetErrorType()));

    // customize auto-tagging
    if (!config_path.empty()) {
      PdfDocTemplate* doc_tmpl = doc->GetDocTemplate();
      if (!doc_tmpl)
        throw std::runtime_error(std::to_string(pdfix->GetErrorType()));

      // A configuration file that cannot be opened is skipped and tagging
      // continues without it (same as before).
      stm = pdfix->CreateFileStream(config_path.c_str(), kPsReadOnly);
      if (stm) {
        if (!doc_tmpl->LoadFromStream(stm, kDataFormatJson))
          throw std::runtime_error(std::to_string(pdfix->GetErrorType()));
        stm->Destroy();
        stm = nullptr;  // already released; keep the handler from doing it twice
      }
    }

    if (!doc->AddTags(nullptr, nullptr))
      throw std::runtime_error(std::to_string(pdfix->GetErrorType()));

    if (!doc->Save(save_path.c_str(), kSaveFull))
      throw std::runtime_error(std::to_string(pdfix->GetErrorType()));
  } catch (...) {
    // The error code was read when the exception was constructed, so cleaning
    // up here cannot change the reported message.
    if (stm)
      stm->Destroy();
    if (doc)
      doc->Close();
    pdfix->Destroy();
    throw;
  }

  doc->Close();
  pdfix->Destroy();
}
using System;
using PDFixSDK.Pdfix;
namespace PDFix.App.Module
{
    /// <summary>
    /// Sample that auto-tags a PDF document and saves the tagged result.
    /// </summary>
    class AddTags
    {
        /// <summary>
        /// Authorizes PDFix, opens <paramref name="openPath"/>, loads a JSON
        /// document template from <paramref name="configPath"/> when a stream
        /// for it can be created, runs AddTags with a cancel callback and saves
        /// the document to <paramref name="savePath"/>.
        /// </summary>
        /// <param name="email">authorization email</param>
        /// <param name="licenseKey">authorization license key</param>
        /// <param name="openPath">source PDF document</param>
        /// <param name="savePath">output PDF document</param>
        /// <param name="configPath">configuration file</param>
        /// <exception cref="Exception">
        /// Thrown with the text of <c>pdfix.GetError()</c> when any step fails.
        /// </exception>
        public static void Run(
            String email,       // authorization email
            String licenseKey,  // authorization license key
            String openPath,    // source PDF document
            String savePath,    // output PDF document
            String configPath   // configuration file
            )
        {
            Pdfix pdfix = new Pdfix();
            try
            {
                if (!pdfix.Authorize(email, licenseKey))
                    throw new Exception(pdfix.GetError());

                PdfDoc doc = pdfix.OpenDoc(openPath, "");
                if (doc == null)
                    throw new Exception(pdfix.GetError());

                // Declared outside the try block so the finally block can
                // release whatever was acquired before a failure.
                PdfPage page = null;
                PdePageMap pageMap = null;
                try
                {
                    // customize auto-tagging; a configuration file that cannot
                    // be opened is skipped
                    PsFileStream stm = pdfix.CreateFileStream(configPath, PsFileMode.kPsReadOnly);
                    if (stm != null)
                    {
                        try
                        {
                            PdfDocTemplate docTmpl = doc.GetDocTemplate();
                            if (docTmpl == null)
                                throw new Exception(pdfix.GetError());
                            if (!docTmpl.LoadFromStream(stm, PsDataFormat.kDataFormatJson))
                                throw new Exception(pdfix.GetError());
                        }
                        finally
                        {
                            stm.Destroy();
                        }
                    }

                    // define a cancel progress callback
                    PdfCancelProc cancel_callback = (data) =>
                    {
                        // to cancel the process return 1
                        Console.WriteLine("PdfCancelProc callback was called");
                        return 0;
                    };

                    page = doc.AcquirePage(0);
                    if (page == null)
                        throw new Exception(pdfix.GetError());
                    pageMap = page.AcquirePageMap(null, IntPtr.Zero);

                    // define an event callback
                    PdfEventProc event_callback = (data) =>
                    {
                        Console.WriteLine("Page contents did change. Releasing pageMap...");
                        if (pageMap != null)
                        {
                            pageMap.Release();
                            pageMap = null;
                        }
                    };

                    if (!pdfix.RegisterEvent(PdfEventType.kEventPageContentsDidChange, event_callback, IntPtr.Zero))
                        throw new Exception(pdfix.GetError());

                    if (!doc.AddTags(cancel_callback, IntPtr.Zero))
                        throw new Exception(pdfix.GetError());

                    if (!doc.Save(savePath, PdfSaveFlags.kSaveFull))
                        throw new Exception(pdfix.GetError());
                }
                finally
                {
                    // The event callback sets pageMap to null once it has
                    // released it, so this only runs if the event never fired.
                    if (pageMap != null)
                    {
                        pageMap.Release();
                        pageMap = null;
                    }
                    if (page != null)
                        page.Release();
                    doc.Close();
                }
            }
            finally
            {
                pdfix.Destroy();
            }
        }
    }
}
package net.pdfix.samples;
import net.pdfix.Utils;
import net.pdfix.pdfixlib.*;
/**
 * Sample that auto-tags a PDF document and saves the tagged result.
 */
public class AddTags {
    /**
     * Loads the native pdfix module, authorizes PDFix, opens {@code openPath},
     * loads a JSON document template from {@code configPath} when a stream for
     * it can be created, runs AddTags and saves the document to
     * {@code savePath}.
     *
     * @param email      authorization email
     * @param licenseKey authorization license key
     * @param openPath   source PDF document
     * @param savePath   output PDF document
     * @param configPath configuration file
     * @throws Exception with the text of {@code pdfix.GetError()} when any
     *                   step fails
     */
    public static void run(
        String email,       // authorization email
        String licenseKey,  // authorization license key
        String openPath,    // source PDF document
        String savePath,    // output PDF document
        String configPath   // configuration file
    ) throws Exception {
        System.load(Utils.getAbsolutePath(Utils.getModuleName("pdfix")));

        Pdfix pdfix = new Pdfix();
        try {
            if (!pdfix.Authorize(email, licenseKey))
                throw new Exception(pdfix.GetError());

            PdfDoc doc = pdfix.OpenDoc(openPath, "");
            if (doc == null)
                throw new Exception(pdfix.GetError());

            try {
                // customize auto-tagging; a configuration file that cannot be
                // opened is skipped
                PsFileStream stm = pdfix.CreateFileStream(configPath, PsFileMode.kPsReadOnly);
                if (stm != null) {
                    try {
                        PdfDocTemplate docTmpl = doc.GetDocTemplate();
                        if (docTmpl == null)
                            throw new Exception(pdfix.GetError());
                        if (!docTmpl.LoadFromStream(stm, PsDataFormat.kDataFormatJson))
                            throw new Exception(pdfix.GetError());
                    } finally {
                        // release the stream on both success and failure
                        stm.Destroy();
                    }
                }

                if (!doc.AddTags())
                    throw new Exception(pdfix.GetError());

                if (!doc.Save(savePath, PdfSaveFlags.kSaveFull))
                    throw new Exception(pdfix.GetError());
            } finally {
                doc.Close();
            }
        } finally {
            pdfix.Destroy();
        }
    }
}

Result

General configuration file:

Our engine uses a general configuration file, which should be fine for the majority of cases. Here's the output using this default configuration file:

Looking at the tag structure, we see that only one table is tagged instead of two separate tables. A similar case can be seen with the images (graphs), which are also tagged under one figure instead of two. There is no perfect algorithm that works under all circumstances. Such a partially incorrect tag structure results in unsatisfactory output, for example when making the PDF accessible or outputting the PDF content into a responsive HTML layout.

For these cases, the PDFix SDK allows customization of the output by using custom configuration files that affect the particular elements detection process and the output tags structure.

Customizing the output

To improve the tagged output of our sample PDF document, we will use this custom JSON configuration file. To learn more about the configuration files, please follow the Documentation. When using the SDK programmatically, there are no limits to fitting the output to your needs.

We can see the updated tag structure after applying the custom configuration file. For example, we set custom headings and pointed to table and image elements to consider within the detection process. Now we have more acceptable and usable tagged PDF output.

The PDFix SDK uses the generated tag structure, for example, to output the PDF content into a responsive HTML layout. To compare the particular outputs, please follow these links:

Open the original responsive HTML output

Open the HTML output after applying custom config file

Contact us if you need help with integration.

Windows, MacOS, Linux

Java, Python, C#, C++