public class PDFMarkedContentExtractor extends LegacyPDFStreamEngine
Modifier and Type | Field and Description |
---|---|
private java.util.Map<java.lang.String,java.util.List<TextPosition>> |
characterListMapping |
private java.util.Stack<PDMarkedContent> |
currentMarkedContents |
private java.util.List<PDMarkedContent> |
markedContents |
private boolean |
suppressDuplicateOverlappingText |
Constructor and Description |
---|
PDFMarkedContentExtractor()
Instantiate a new PDFMarkedContentExtractor object.
|
PDFMarkedContentExtractor(java.lang.String encoding)
Constructor.
|
Modifier and Type | Method and Description |
---|---|
void |
beginMarkedContentSequence(COSName tag,
COSDictionary properties)
Called when a marked content group begins
|
void |
endMarkedContentSequence()
Called when a marked content group ends
|
java.util.List<PDMarkedContent> |
getMarkedContents() |
protected void |
processTextPosition(TextPosition text)
This will process a TextPosition object and add the
text to the list of characters on a page.
|
private boolean |
within(float first,
float second,
float variance)
This will determine if two floating point numbers are within a specified variance.
|
void |
xobject(PDXObject xobject) |
processPage, showGlyph
addOperator, applyTextAdjustment, beginText, decreaseLevel, endText, getAppearance, getCurrentPage, getGraphicsStackSize, getGraphicsState, getInitialMatrix, getLevel, getResources, getTextLineMatrix, getTextMatrix, increaseLevel, operatorException, processAnnotation, processChildStream, processOperator, processOperator, processSoftMask, processTilingPattern, processTilingPattern, processTransparencyGroup, processType3Stream, registerOperatorProcessor, restoreGraphicsStack, restoreGraphicsState, saveGraphicsStack, saveGraphicsState, setLineDashPattern, setTextLineMatrix, setTextMatrix, showAnnotation, showFontGlyph, showForm, showText, showTextString, showTextStrings, showTransparencyGroup, showType3Glyph, transformedPoint, transformWidth, unsupportedOperator
private final boolean suppressDuplicateOverlappingText
private final java.util.List<PDMarkedContent> markedContents
private final java.util.Stack<PDMarkedContent> currentMarkedContents
private final java.util.Map<java.lang.String,java.util.List<TextPosition>> characterListMapping
public PDFMarkedContentExtractor() throws java.io.IOException
java.io.IOException
public PDFMarkedContentExtractor(java.lang.String encoding) throws java.io.IOException
encoding
- The encoding that the output will be written in.
java.io.IOException
private boolean within(float first, float second, float variance)
first
- The first number to compare to.
second
- The second number to compare to.
variance
- The allowed variance.
public void beginMarkedContentSequence(COSName tag, COSDictionary properties)
PDFStreamEngine
beginMarkedContentSequence
in class PDFStreamEngine
tag
- indicates the role or significance of the sequence
properties
- optional properties
public void endMarkedContentSequence()
PDFStreamEngine
endMarkedContentSequence
in class PDFStreamEngine
public void xobject(PDXObject xobject)
protected void processTextPosition(TextPosition text)
processTextPosition
in class LegacyPDFStreamEngine
text
- The text to process.
public java.util.List<PDMarkedContent> getMarkedContents()