BE THE CODER - com/itextpdf/text/pdf/parser/TaggedPdfReaderTool.java


/*

 * $Id: TaggedPdfReaderTool.java 4813 2011-04-26 10:35:49Z blowagie $

 *

 * This file is part of the iText (R) project.

 * Copyright (c) 1998-2011 1T3XT BVBA

 * Authors: Bruno Lowagie, et al.

 *

 * This program is free software; you can redistribute it and/or modify

 * it under the terms of the GNU Affero General Public License version 3

 * as published by the Free Software Foundation with the addition of the

 * following permission added to Section 15 as permitted in Section 7(a):

 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,

 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.

 *

 * This program is distributed in the hope that it will be useful, but

 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY

 * or FITNESS FOR A PARTICULAR PURPOSE.

 * See the GNU Affero General Public License for more details.

 * You should have received a copy of the GNU Affero General Public License

 * along with this program; if not, see http://www.gnu.org/licenses or write to

 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,

 * Boston, MA, 02110-1301 USA, or download the license from the following URL:

 * http://itextpdf.com/terms-of-use/

 *

 * The interactive user interfaces in modified source and object code versions

 * of this program must display Appropriate Legal Notices, as required under

 * Section 5 of the GNU Affero General Public License.

 *

 * In accordance with Section 7(b) of the GNU Affero General Public License,

 * a covered work must retain the producer line in every PDF that is created

 * or manipulated using iText.

 *

 * You can be released from the requirements of the license by purchasing

 * a commercial license. Buying such a license is mandatory as soon as you

 * develop commercial activities involving the iText software without

 * disclosing the source code of your own applications.

 * These activities include: offering paid services to customers as an ASP,

 * serving PDFs on the fly in a web application, shipping iText with a closed

 * source product.

 *

 * For more information, please contact iText Software Corp. at this

 * address: [email protected]

 */

package com.itextpdf.text.pdf.parser;



import java.io.IOException;

import java.io.OutputStream;

import java.io.PrintWriter;



import com.itextpdf.text.error_messages.MessageLocalization;

import com.itextpdf.text.pdf.PdfArray;

import com.itextpdf.text.pdf.PdfDictionary;

import com.itextpdf.text.pdf.PdfName;

import com.itextpdf.text.pdf.PdfNumber;

import com.itextpdf.text.pdf.PdfObject;

import com.itextpdf.text.pdf.PdfReader;

import com.itextpdf.text.xml.XMLUtil;

import java.io.OutputStreamWriter;

import java.nio.charset.Charset;



/**

 * Converts a tagged PDF document into an XML file.

 * 

 * @since 5.0.2

 */

public class TaggedPdfReaderTool {



  /** The reader object from which the content streams are read. */

  PdfReader reader;

  /** The writer object to which the XML will be written */

  PrintWriter out;



  /**

   * Parses a string with structured content.

   * 

   * @param reader

   *            the PdfReader that has access to the PDF file

   * @param os

   *            the OutputStream to which the resulting xml will be written

   * @param charset

   *            the charset to encode the data

     * @since 5.0.5

   */

  public void convertToXml(PdfReader reader, OutputStream os, String charset)

      throws IOException {

    this.reader = reader;

        OutputStreamWriter outs = new OutputStreamWriter(os, charset);

    out = new PrintWriter(outs);

    // get the StructTreeRoot from the root object

    PdfDictionary catalog = reader.getCatalog();

    PdfDictionary struct = catalog.getAsDict(PdfName.STRUCTTREEROOT);

    if (struct == null)

      throw new IOException(MessageLocalization.getComposedMessage("no.structtreeroot.found"));

    // Inspect the child or children of the StructTreeRoot

    inspectChild(struct.getDirectObject(PdfName.K));

    out.flush();

    out.close();

  }



  /**

   * Parses a string with structured content. The output is done using the

     * current charset.

   *

   * @param reader

   *            the PdfReader that has access to the PDF file

   * @param os

   *            the OutputStream to which the resulting xml will be written

   */

  public void convertToXml(PdfReader reader, OutputStream os)

      throws IOException {

        convertToXml(reader, os, Charset.defaultCharset().name());

    }



    /**

   * Inspects a child of a structured element. This can be an array or a

   * dictionary.

   * 

   * @param k

   *            the child to inspect

   * @throws IOException

   */

  public void inspectChild(PdfObject k) throws IOException {

    if (k == null)

      return;

    if (k instanceof PdfArray)

      inspectChildArray((PdfArray) k);

    else if (k instanceof PdfDictionary)

      inspectChildDictionary((PdfDictionary) k);

  }



  /**

   * If the child of a structured element is an array, we need to loop over

   * the elements.

   * 

   * @param k

   *            the child array to inspect

   */

  public void inspectChildArray(PdfArray k) throws IOException {

    if (k == null)

      return;

    for (int i = 0; i < k.size(); i++) {

      inspectChild(k.getDirectObject(i));

    }

  }



  /**

   * If the child of a structured element is a dictionary, we inspect the

   * child; we may also draw a tag.

   * 

   * @param k

   *            the child dictionary to inspect

   */

  public void inspectChildDictionary(PdfDictionary k) throws IOException {

    if (k == null)

      return;

    PdfName s = k.getAsName(PdfName.S);

    if (s != null) {

            String tagN = PdfName.decodeName(s.toString());

      String tag = fixTagName(tagN);

      out.print("<");

      out.print(tag);

      out.print(">");

      PdfDictionary dict = k.getAsDict(PdfName.PG);

      if (dict != null)

        parseTag(tagN, k.getDirectObject(PdfName.K), dict);

      inspectChild(k.getDirectObject(PdfName.K));

      out.print("</");

      out.print(tag);

      out.println(">");

    } else

      inspectChild(k.getDirectObject(PdfName.K));

  }



    private static String fixTagName(String tag) {

        StringBuilder sb = new StringBuilder();

        for (int k = 0; k < tag.length(); ++k) {

            char c = tag.charAt(k);

            boolean nameStart =

                c == ':'

                || (c >= 'A' && c <= 'Z')

                || c == '_'

                || (c >= 'a' && c <= 'z')

                || (c >= '\u00c0' && c <= '\u00d6')

                || (c >= '\u00d8' && c <= '\u00f6')

                || (c >= '\u00f8' && c <= '\u02ff')

                || (c >= '\u0370' && c <= '\u037d')

                || (c >= '\u037f' && c <= '\u1fff')

                || (c >= '\u200c' && c <= '\u200d')

                || (c >= '\u2070' && c <= '\u218f')

                || (c >= '\u2c00' && c <= '\u2fef')

                || (c >= '\u3001' && c <= '\ud7ff')

                || (c >= '\uf900' && c <= '\ufdcf')

                || (c >= '\ufdf0' && c <= '\ufffd');

            boolean nameMiddle =

                c == '-'

                || c == '.'

                || (c >= '0' && c <= '9')

                || c == '\u00b7'

                || (c >= '\u0300' && c <= '\u036f')

                || (c >= '\u203f' && c <= '\u2040')

                || nameStart;

            if (k == 0) {

                if (!nameStart)

                    c = '_';

            }

            else {

                if (!nameMiddle)

                    c = '-';

            }

            sb.append(c);

        }

        return sb.toString();

    }



  /**

   * Searches for a tag in a page.

   * 

   * @param tag

   *            the name of the tag

   * @param object

   *            an identifier to find the marked content

   * @param page

   *            a page dictionary

   * @throws IOException

   */

  public void parseTag(String tag, PdfObject object, PdfDictionary page)

      throws IOException {

    // if the identifier is a number, we can extract the content right away

    if (object instanceof PdfNumber) {

      PdfNumber mcid = (PdfNumber) object;

      RenderFilter filter = new MarkedContentRenderFilter(mcid.intValue());

      TextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

      FilteredTextRenderListener listener = new FilteredTextRenderListener(

          strategy, filter);

      PdfContentStreamProcessor processor = new PdfContentStreamProcessor(

          listener);

      processor.processContent(PdfReader.getPageContent(page), page

          .getAsDict(PdfName.RESOURCES));

      out.print(XMLUtil.escapeXML(listener.getResultantText(), true));

    }

    // if the identifier is an array, we call the parseTag method

    // recursively

    else if (object instanceof PdfArray) {

      PdfArray arr = (PdfArray) object;

      int n = arr.size();

      for (int i = 0; i < n; i++) {

        parseTag(tag, arr.getPdfObject(i), page);

        if (i < n - 1)

          out.println();

      }

    }

    // if the identifier is a dictionary, we get the resources from the

    // dictionary

    else if (object instanceof PdfDictionary) {

      PdfDictionary mcr = (PdfDictionary) object;

      parseTag(tag, mcr.getDirectObject(PdfName.MCID), mcr

          .getAsDict(PdfName.PG));

    }

  }



}
Open Source Repository
Home	/itextpdf/itextpdf-5.1.2 \| Repository Home
Open Source Repository