Code with Finding: |
class TaggedPdfReaderTool {
    /**
     * Searches for a tag in a page and prints the XML-escaped text of the
     * matching marked content to {@code out}.
     * <p>
     * The identifier is handled according to its type:
     * <ul>
     * <li>a number is used as a marked-content ID (MCID) and the text of
     * that marked content is extracted from the page;</li>
     * <li>an array is walked element by element, calling this method
     * recursively for each entry and printing a line break between
     * entries;</li>
     * <li>a dictionary is treated as a marked-content reference: its MCID
     * entry is resolved and processed against the page named by its PG
     * entry, or against {@code page} when it has no PG entry.</li>
     * </ul>
     * Identifiers of any other type (including {@code null}) are ignored.
     *
     * @param tag
     *            the name of the tag; it is only forwarded to the recursive
     *            calls
     * @param object
     *            an identifier to find the marked content
     * @param page
     *            a page dictionary; if no page is known when a numeric
     *            identifier is reached, nothing is printed for it
     * @throws IOException
     *             propagated from reading or processing the page content
     */
    public void parseTag(String tag, PdfObject object, PdfDictionary page)
            throws IOException {
        // if the identifier is a number, we can extract the content right away
        if (object instanceof PdfNumber) {
            // Without a page dictionary there is no content stream to search;
            // skip the identifier instead of failing with a
            // NullPointerException below.
            if (page == null) {
                return;
            }
            PdfNumber mcid = (PdfNumber) object;
            // only text rendered inside the marked content with this MCID
            // reaches the extraction strategy
            RenderFilter filter = new MarkedContentRenderFilter(mcid.intValue());
            TextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            FilteredTextRenderListener listener = new FilteredTextRenderListener(
                    strategy, filter);
            PdfContentStreamProcessor processor = new PdfContentStreamProcessor(
                    listener);
            processor.processContent(PdfReader.getPageContent(page), page
                    .getAsDict(PdfName.RESOURCES));
            out.print(XMLUtil.escapeXML(listener.getResultantText(), true));
        }
        // if the identifier is an array, we call the parseTag method
        // recursively
        else if (object instanceof PdfArray) {
            PdfArray arr = (PdfArray) object;
            int n = arr.size();
            for (int i = 0; i < n; i++) {
                parseTag(tag, arr.getPdfObject(i), page);
                // separate the entries, but do not add a trailing line break
                if (i < n - 1) {
                    out.println();
                }
            }
        }
        // if the identifier is a dictionary, we get the MCID and the page
        // from the dictionary
        else if (object instanceof PdfDictionary) {
            PdfDictionary mcr = (PdfDictionary) object;
            PdfObject mcid = mcr.getDirectObject(PdfName.MCID);
            PdfDictionary mcrPage = mcr.getAsDict(PdfName.PG);
            // The PG entry is optional in a marked-content reference; when it
            // is missing, the content lives on the page we were called with.
            if (mcrPage == null) {
                mcrPage = page;
            }
            parseTag(tag, mcid, mcrPage);
        }
    }
}
|