I am working on a PDF text extractor with iText7 and am noticing strange text coordinates on a certain PDF. Most documents appear to yield x and y coordinates that fall within the width and height of the page, but one seems to yield negative values. Is there a standard approach to dealing with negative coordinates here? My basic approach is to take positive inch measurements from a PDF and map them to the text and coordinates extracted by iText7, using a scale factor of 1/72 inch per point.
I am deriving from LocationTextExtractionStrategy, and the code is as follows:
/// <summary>
/// Extraction strategy that records one <c>TextRect</c> for every
/// non-whitespace character delivered through RENDER_TEXT events.
/// </summary>
/// <remarks>
/// This override does not call the base <c>EventOccurred</c>; only the
/// rectangle list kept by this class is populated.
/// NOTE(review): coordinates are stored exactly as iText reports them, with no
/// translation or clamping. They are presumably in PDF user space, so a page
/// whose box origin is not (0, 0) could produce negative values -- confirm
/// against the page's media/crop box.
/// </remarks>
private class LocationTextListStrategy : LocationTextExtractionStrategy
{
    // One entry per non-whitespace character, in the order events arrive.
    private readonly List<TextRect> _textRects = new List<TextRect>();

    /// <summary>
    /// Gets the rectangles collected so far.
    /// </summary>
    /// <returns>The backing list itself (not a copy).</returns>
    public List<TextRect> TextRects()
    {
        return _textRects;
    }

    /// <summary>
    /// Splits each rendered text chunk into per-character infos and stores a
    /// rectangle for every character whose text is not null/whitespace.
    /// </summary>
    /// <param name="data">Event payload; cast to <c>TextRenderInfo</c> for RENDER_TEXT events.</param>
    /// <param name="type">Kind of event; everything except RENDER_TEXT is ignored.</param>
    public override void EventOccurred(IEventData data, EventType type)
    {
        if (type != EventType.RENDER_TEXT)
        {
            return;
        }

        var chunk = (TextRenderInfo)data;
        foreach (var glyph in chunk.GetCharacterRenderInfos())
        {
            if (!string.IsNullOrWhiteSpace(glyph.GetText()))
            {
                AddTextRect(glyph);
            }
        }
    }

    /// <summary>
    /// Builds a <c>TextRect</c> for a single character and appends it to the list.
    /// </summary>
    /// <param name="glyph">Render info of the character to record.</param>
    private void AddTextRect(TextRenderInfo glyph)
    {
        // Component indices used with the vectors' Get(int) accessor.
        const int xIndex = 0;
        const int yIndex = 1;

        // Left/bottom come from the start of the baseline,
        // right/top from the end of the ascent line.
        var baselineStart = glyph.GetBaseline().GetStartPoint();
        var ascentEnd = glyph.GetAscentLine().GetEndPoint();

        _textRects.Add(new TextRect(
            text: glyph.GetText(),
            l: baselineStart.Get(xIndex),
            r: ascentEnd.Get(xIndex),
            t: ascentEnd.Get(yIndex),
            b: baselineStart.Get(yIndex)));
    }
}
Question from:
https://stackoverflow.com/questions/65887347/c-sharp-itext7-text-coordinate-extraction-question