mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-04-05 20:55:01 +08:00
minor optimisations
This commit is contained in:
parent
755e199fed
commit
4312aa470e
@ -23,8 +23,8 @@
|
||||
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
|
||||
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
|
||||
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T> elements,
|
||||
Func<PdfPoint, PdfPoint, double> distMeasure,
|
||||
@ -91,8 +91,8 @@
|
||||
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
|
||||
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
|
||||
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T> elements, int k,
|
||||
Func<PdfPoint, PdfPoint, double> distMeasure,
|
||||
@ -159,8 +159,8 @@
|
||||
/// <param name="candidatesLine">The candidates' line to use for pairing.</param>
|
||||
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
|
||||
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T> elements,
|
||||
Func<PdfLine, PdfLine, double> distMeasure,
|
||||
|
@ -14,19 +14,19 @@
|
||||
/// <summary>
|
||||
/// Algorithm that retrieves blocks that are labelled as decoration (e.g. headers, footers) for each page in the document, using a content and a geometric similarity measure.
|
||||
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
|
||||
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
|
||||
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
|
||||
/// left or right edge of the page.</para>
|
||||
/// <para>See section 4.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern.</para>
|
||||
/// </summary>
|
||||
public static class DecorationTextBlockClassifier
|
||||
{
|
||||
private static readonly Regex NumbersPattern = new Regex(@"(\d+)|(\b([MDCLXVI]+)\b)", RegexOptions.IgnoreCase);
|
||||
private static string replacementChar = "@";
|
||||
private const string replacementChar = "@";
|
||||
|
||||
/// <summary>
|
||||
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
|
||||
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
|
||||
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
|
||||
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
|
||||
/// left or right edge of the page.</para>
|
||||
/// </summary>
|
||||
/// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
|
||||
@ -34,8 +34,8 @@
|
||||
/// <param name="pageSegmenter"></param>
|
||||
/// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
|
||||
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<Page> pages,
|
||||
IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
|
||||
@ -47,7 +47,7 @@
|
||||
/// <summary>
|
||||
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
|
||||
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
|
||||
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
|
||||
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
|
||||
/// left or right edge of the page.</para>
|
||||
/// </summary>
|
||||
/// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
|
||||
@ -56,8 +56,8 @@
|
||||
/// <param name="minimumEditDistanceNormalised">Minimum edit distance normalised. A value of 0 means both strings are exactly equal.</param>
|
||||
/// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
|
||||
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<Page> pages,
|
||||
IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, Func<string, string, double> minimumEditDistanceNormalised,
|
||||
@ -92,14 +92,14 @@
|
||||
/// <summary>
|
||||
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
|
||||
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
|
||||
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
|
||||
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
|
||||
/// left or right edge of the page.</para>
|
||||
/// </summary>
|
||||
/// <param name="pagesTextBlocks">The <see cref="TextBlock"/>s of every page in the document. All of them are needed for the algorithm to work.</param>
|
||||
/// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
|
||||
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<IReadOnlyList<TextBlock>> pagesTextBlocks,
|
||||
double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
|
||||
@ -110,15 +110,15 @@
|
||||
/// <summary>
|
||||
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
|
||||
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
|
||||
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
|
||||
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
|
||||
/// left or right edge of the page.</para>
|
||||
/// </summary>
|
||||
/// <param name="pagesTextBlocks">The <see cref="TextBlock"/>s of every page in the document. All of them are needed for the algorithm to work.</param>
|
||||
/// <param name="minimumEditDistanceNormalised">Minimum edit distance normalised. A value of 0 means both strings are exactly equal.</param>
|
||||
/// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
|
||||
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<IReadOnlyList<TextBlock>> pagesTextBlocks,
|
||||
Func<string, string, double> minimumEditDistanceNormalised, double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
|
||||
@ -219,7 +219,7 @@
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// [The content similarity] is calculated from the normalized edit
|
||||
/// [The content similarity] is calculated from the normalized edit
|
||||
/// distance between the two content strings, where digits are replaced with “@” chars.
|
||||
/// A content similarity of 1 is reached when both strings are exactly equal.
|
||||
/// </summary>
|
||||
@ -248,7 +248,7 @@
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// This similarity score is a value in the range [0,1] and given
|
||||
/// This similarity score is a value in the range [0,1] and given
|
||||
/// by the product between the content and the geometric similarity.
|
||||
/// </summary>
|
||||
private static double Similarity(TextBlock b1, TextBlock b2, Func<string, string, double> minimumEditDistanceNormalised)
|
||||
|
@ -15,7 +15,7 @@
|
||||
/// <returns>The mode of the sequence. Returns <see cref="float.NaN"/> if the sequence has no mode or if it is not unique.</returns>
|
||||
public static float Mode(this IEnumerable<float> array)
|
||||
{
|
||||
if (array == null || !array.Any()) return float.NaN;
|
||||
if (array?.Any() != true) return float.NaN;
|
||||
var sorted = array.GroupBy(v => v).Select(v => (v.Count(), v.Key)).OrderByDescending(g => g.Item1);
|
||||
var mode = sorted.First();
|
||||
if (sorted.Count() > 1 && mode.Item1 == sorted.ElementAt(1).Item1) return float.NaN;
|
||||
@ -29,7 +29,7 @@
|
||||
/// <returns>The mode of the sequence. Returns <see cref="double.NaN"/> if the sequence has no mode or if it is not unique.</returns>
|
||||
public static double Mode(this IEnumerable<double> array)
|
||||
{
|
||||
if (array == null || !array.Any()) return double.NaN;
|
||||
if (array?.Any() != true) return double.NaN;
|
||||
var sorted = array.GroupBy(v => v).Select(v => (v.Count(), v.Key)).OrderByDescending(g => g.Item1);
|
||||
var mode = sorted.First();
|
||||
if (sorted.Count() > 1 && mode.Item1 == sorted.ElementAt(1).Item1) return double.NaN;
|
||||
|
@ -15,13 +15,13 @@
|
||||
/// </summary>
|
||||
public static UnsupervisedReadingOrderDetector Instance { get; } = new UnsupervisedReadingOrderDetector();
|
||||
|
||||
private double T;
|
||||
private readonly double T;
|
||||
|
||||
/// <summary>
|
||||
/// Algorithm that retrieves the blocks' reading order using both (spatial) Allen’s interval relations and rendering order.
|
||||
/// </summary>
|
||||
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.
|
||||
/// This flexibility is necessary because due to the inherent noise in the PDF extraction text blocks in the
|
||||
/// This flexibility is necessary because due to the inherent noise in the PDF extraction text blocks in the
|
||||
/// same column might not be exactly aligned.</param>
|
||||
public UnsupervisedReadingOrderDetector(double T = 5)
|
||||
{
|
||||
@ -38,10 +38,10 @@
|
||||
|
||||
var graph = BuildGraph(textBlocks, T);
|
||||
|
||||
while (graph.Any())
|
||||
while (graph.Count > 0)
|
||||
{
|
||||
var maxCount = graph.Max(kvp => kvp.Value.Count);
|
||||
var current = graph.Where(kvp => kvp.Value.Count == maxCount).FirstOrDefault();
|
||||
var current = graph.FirstOrDefault(kvp => kvp.Value.Count == maxCount);
|
||||
graph.Remove(current.Key);
|
||||
int index = current.Key;
|
||||
|
||||
@ -105,19 +105,14 @@
|
||||
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
|
||||
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
|
||||
|
||||
if (xRelation == IntervalRelations.Precedes ||
|
||||
yRelation == IntervalRelations.Precedes ||
|
||||
xRelation == IntervalRelations.Meets ||
|
||||
yRelation == IntervalRelations.Meets ||
|
||||
xRelation == IntervalRelations.Overlaps ||
|
||||
yRelation == IntervalRelations.Overlaps)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
return xRelation == IntervalRelations.Precedes ||
|
||||
yRelation == IntervalRelations.Precedes ||
|
||||
xRelation == IntervalRelations.Meets ||
|
||||
yRelation == IntervalRelations.Meets ||
|
||||
xRelation == IntervalRelations.Overlaps ||
|
||||
yRelation == IntervalRelations.Overlaps;
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Column-wise: text-blocks are read in columns, from top-to-bottom and from left-to-right.
|
||||
/// </summary>
|
||||
@ -130,7 +125,7 @@
|
||||
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
|
||||
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
|
||||
|
||||
if (xRelation == IntervalRelations.Precedes ||
|
||||
return xRelation == IntervalRelations.Precedes ||
|
||||
xRelation == IntervalRelations.Meets ||
|
||||
(xRelation == IntervalRelations.Overlaps && (yRelation == IntervalRelations.Precedes ||
|
||||
yRelation == IntervalRelations.Meets ||
|
||||
@ -146,12 +141,7 @@
|
||||
xRelation == IntervalRelations.DuringI ||
|
||||
xRelation == IntervalRelations.Finishes ||
|
||||
xRelation == IntervalRelations.StartsI ||
|
||||
xRelation == IntervalRelations.OverlapsI)))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
xRelation == IntervalRelations.OverlapsI));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@ -160,40 +150,34 @@
|
||||
/// <param name="a"></param>
|
||||
/// <param name="b"></param>
|
||||
/// <param name="T">The tolerance parameter T.</param>
|
||||
/// <returns></returns>
|
||||
private bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T)
{
    // Decides whether block a comes before block b, based on the interval
    // relations of the two blocks on each axis (both computed with tolerance T).
    var horizontal = GetIntervalRelationX(a, b, T);
    var vertical = GetIntervalRelationY(a, b, T);

    // Rule 1: a Precedes or Meets b on the Y axis - no X condition is needed.
    if (vertical == IntervalRelations.Precedes || vertical == IntervalRelations.Meets)
    {
        return true;
    }

    // Every remaining rule requires a to Precede, Meet or Overlap b on the X axis.
    bool leadsOnX = horizontal == IntervalRelations.Precedes
                    || horizontal == IntervalRelations.Meets
                    || horizontal == IntervalRelations.Overlaps;

    if (!leadsOnX)
    {
        return false;
    }

    // Rules 2 and 3 combined: with a leading on X, any of the Y relations below is accepted.
    // The stand-alone "Y Overlaps" rule is a subset of this list, so it is folded in here;
    // Precedes and Meets on Y were already handled by rule 1.
    bool acceptedOnY = vertical == IntervalRelations.Overlaps
                       || vertical == IntervalRelations.Starts
                       || vertical == IntervalRelations.FinishesI
                       || vertical == IntervalRelations.Equals
                       || vertical == IntervalRelations.During
                       || vertical == IntervalRelations.DuringI
                       || vertical == IntervalRelations.Finishes
                       || vertical == IntervalRelations.StartsI
                       || vertical == IntervalRelations.OverlapsI;

    return acceptedOnY;
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the Thick Boundary Rectangle Relations (TBRR) for the X coordinate.
|
||||
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
|
||||
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
|
||||
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
|
||||
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
|
||||
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
|
||||
/// </summary>
|
||||
/// <param name="a"></param>
|
||||
@ -201,85 +185,83 @@
|
||||
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
|
||||
private IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T)
{
    // Classifies the X extent of block a against the X extent of block b as one of the
    // thirteen interval relations (seven forward relations plus their inverses), using a
    // "thick boundary": two coordinates closer than T are considered equal.
    //
    // Each inverse relation (suffix 'I') is the forward relation with a and b swapped.
    // The previous implementation tested 'a.Right >= b.Left - T' for PrecedesI, which is the
    // exact complement of the Precedes test, so no other relation could ever be returned.
    //
    // Because of the tolerance, more than one relation can hold for very narrow blocks;
    // the checks run in a fixed order and the first match wins.
    var aStart = a.BoundingBox.Left;
    var aEnd = a.BoundingBox.Right;
    var bStart = b.BoundingBox.Left;
    var bEnd = b.BoundingBox.Right;

    // Thick-boundary comparison of the two start coordinates.
    bool startsEqual = bStart - T <= aStart && aStart <= bStart + T;
    bool aStartsFirst = aStart < bStart - T;
    bool bStartsFirst = aStart > bStart + T;

    // Thick-boundary comparison of the two end coordinates.
    bool endsEqual = bEnd - T <= aEnd && aEnd <= bEnd + T;
    bool aEndsFirst = aEnd < bEnd - T;
    bool bEndsFirst = aEnd > bEnd + T;

    // a ends clearly before b starts.
    if (aEnd < bStart - T)
    {
        return IntervalRelations.Precedes;
    }

    // b ends clearly before a starts.
    if (bEnd < aStart - T)
    {
        return IntervalRelations.PrecedesI;
    }

    // a's end coincides with b's start.
    if (bStart - T <= aEnd && aEnd <= bStart + T)
    {
        return IntervalRelations.Meets;
    }

    // b's end coincides with a's start.
    if (aStart - T <= bEnd && bEnd <= aStart + T)
    {
        return IntervalRelations.MeetsI;
    }

    // a starts first, runs into b, and ends inside b.
    if (aStartsFirst && bStart + T < aEnd && aEndsFirst)
    {
        return IntervalRelations.Overlaps;
    }

    // b starts first, runs into a, and ends inside a.
    if (bStartsFirst && aStart + T < bEnd && bEndsFirst)
    {
        return IntervalRelations.OverlapsI;
    }

    // Same start, a is the shorter one.
    if (startsEqual && aEndsFirst)
    {
        return IntervalRelations.Starts;
    }

    // Same start, b is the shorter one.
    if (startsEqual && bEndsFirst)
    {
        return IntervalRelations.StartsI;
    }

    // a lies strictly inside b.
    if (bStartsFirst && aEndsFirst)
    {
        return IntervalRelations.During;
    }

    // b lies strictly inside a.
    if (aStartsFirst && bEndsFirst)
    {
        return IntervalRelations.DuringI;
    }

    // Same end, a starts later (a is the shorter one).
    if (bStartsFirst && endsEqual)
    {
        return IntervalRelations.Finishes;
    }

    // Same end, b starts later (b is the shorter one).
    if (aStartsFirst && endsEqual)
    {
        return IntervalRelations.FinishesI;
    }

    // Same start and same end.
    if (startsEqual && endsEqual)
    {
        return IntervalRelations.Equals;
    }

    // Only reachable when a comparison is undefined (e.g. a NaN coordinate).
    return IntervalRelations.Unknown;
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the Thick Boundary Rectangle Relations (TBRR) for the Y coordinate.
|
||||
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
|
||||
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
|
||||
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
|
||||
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
|
||||
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
|
||||
/// </summary>
|
||||
/// <param name="a"></param>
|
||||
@ -287,79 +269,77 @@
|
||||
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
|
||||
private IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T)
{
    // Classifies the Y extent of block a against the Y extent of block b as one of the
    // thirteen interval relations (seven forward relations plus their inverses), using a
    // "thick boundary": two coordinates closer than T are considered equal.
    //
    // The interval of a block runs from its Top (start) to its Bottom (end).
    // NOTE(review): this assumes Y grows upwards, i.e. Top >= Bottom numerically, so the
    // block that starts first is the one with the larger Top - confirm against PdfRectangle.
    //
    // Each inverse relation (suffix 'I') is the forward relation with a and b swapped.
    // The previous implementation tested 'a.Bottom >= b.Top - T' right after its complement,
    // so only Precedes / PrecedesI could ever be returned.
    //
    // Because of the tolerance, more than one relation can hold for very flat blocks;
    // the checks run in a fixed order and the first match wins.
    var aTop = a.BoundingBox.Top;
    var aBottom = a.BoundingBox.Bottom;
    var bTop = b.BoundingBox.Top;
    var bBottom = b.BoundingBox.Bottom;

    // Thick-boundary comparison of the two start (Top) coordinates.
    bool startsEqual = bTop - T <= aTop && aTop <= bTop + T;
    bool aStartsFirst = aTop > bTop + T;
    bool bStartsFirst = aTop < bTop - T;

    // Thick-boundary comparison of the two end (Bottom) coordinates.
    bool endsEqual = bBottom - T <= aBottom && aBottom <= bBottom + T;
    bool aEndsFirst = aBottom > bBottom + T;
    bool bEndsFirst = aBottom < bBottom - T;

    // a ends clearly before b starts (a is entirely above b).
    if (aBottom > bTop + T)
    {
        return IntervalRelations.Precedes;
    }

    // b ends clearly before a starts (b is entirely above a).
    if (bBottom > aTop + T)
    {
        return IntervalRelations.PrecedesI;
    }

    // a's end coincides with b's start.
    if (bTop - T <= aBottom && aBottom <= bTop + T)
    {
        return IntervalRelations.Meets;
    }

    // b's end coincides with a's start.
    if (aTop - T <= bBottom && bBottom <= aTop + T)
    {
        return IntervalRelations.MeetsI;
    }

    // a starts first, runs into b, and ends inside b.
    if (aStartsFirst && aBottom < bTop - T && aEndsFirst)
    {
        return IntervalRelations.Overlaps;
    }

    // b starts first, runs into a, and ends inside a.
    if (bStartsFirst && bBottom < aTop - T && bEndsFirst)
    {
        return IntervalRelations.OverlapsI;
    }

    // Same start, a is the shorter one.
    if (startsEqual && aEndsFirst)
    {
        return IntervalRelations.Starts;
    }

    // Same start, b is the shorter one.
    if (startsEqual && bEndsFirst)
    {
        return IntervalRelations.StartsI;
    }

    // a lies strictly inside b.
    if (bStartsFirst && aEndsFirst)
    {
        return IntervalRelations.During;
    }

    // b lies strictly inside a.
    if (aStartsFirst && bEndsFirst)
    {
        return IntervalRelations.DuringI;
    }

    // Same end, a starts later (a is the shorter one).
    if (bStartsFirst && endsEqual)
    {
        return IntervalRelations.Finishes;
    }

    // Same end, b starts later (b is the shorter one).
    if (aStartsFirst && endsEqual)
    {
        return IntervalRelations.FinishesI;
    }

    // Same start and same end.
    if (startsEqual && endsEqual)
    {
        return IntervalRelations.Equals;
    }

    // Only reachable when a comparison is undefined (e.g. a NaN coordinate).
    return IntervalRelations.Unknown;
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -29,15 +29,15 @@
|
||||
/// </summary>
|
||||
/// <param name="pageWords">The words in the page.</param>
|
||||
/// <param name="minimumElements">The minimum number of elements to define a text edge.</param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
public static IReadOnlyDictionary<EdgeType, List<PdfLine>> GetEdges(IEnumerable<Word> pageWords, int minimumElements = 4,
|
||||
int maxDegreeOfParallelism = -1)
|
||||
{
|
||||
if (minimumElements < 0)
|
||||
{
|
||||
throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", "minimumElements");
|
||||
throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", nameof(minimumElements));
|
||||
}
|
||||
|
||||
var cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text.Trim()));
|
||||
@ -46,10 +46,7 @@
|
||||
|
||||
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
|
||||
|
||||
Parallel.ForEach(edgesFuncs, parallelOptions, f =>
|
||||
{
|
||||
dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements));
|
||||
});
|
||||
Parallel.ForEach(edgesFuncs, parallelOptions, f => dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements)));
|
||||
return dictionary.ToDictionary(x => x.Key, x => x.Value);
|
||||
}
|
||||
|
||||
|
@ -68,7 +68,7 @@
|
||||
else if (previous.Value != " ")
|
||||
{
|
||||
var gap = letter.StartBaseLine.X - previous.EndBaseLine.X;
|
||||
|
||||
|
||||
if (WhitespaceSizeStatistics.IsProbablyWhitespace(gap, previous))
|
||||
{
|
||||
sb.Append(" ");
|
||||
|
@ -39,7 +39,7 @@
|
||||
/// <param name="minWidth">Lower bounds for the width of rectangles.</param>
|
||||
/// <param name="minHeight">Lower bounds for the height of rectangles.</param>
|
||||
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
|
||||
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
|
||||
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
|
||||
/// surrounding obstacles by some percent. Default value is 15%.</param>
|
||||
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
|
||||
/// <returns>The identified whitespace rectangles.</returns>
|
||||
@ -49,7 +49,7 @@
|
||||
var bboxes = words.Where(w => w.BoundingBox.Width > 0 && w.BoundingBox.Height > 0)
|
||||
.Select(o => o.BoundingBox).ToList();
|
||||
|
||||
if (images != null && images.Count() > 0)
|
||||
if (images?.Any() == true)
|
||||
{
|
||||
bboxes.AddRange(images.Where(w => w.Bounds.Width > 0 && w.Bounds.Height > 0).Select(o => o.Bounds));
|
||||
}
|
||||
@ -69,14 +69,14 @@
|
||||
/// <param name="minWidth">Lower bounds for the width of rectangles.</param>
|
||||
/// <param name="minHeight">Lower bounds for the height of rectangles.</param>
|
||||
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
|
||||
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
|
||||
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
|
||||
/// surrounding obstacles by some percent. Default value is 15%.</param>
|
||||
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
|
||||
/// <returns>The identified whitespace rectangles.</returns>
|
||||
public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<PdfRectangle> boundingboxes,
|
||||
double minWidth, double minHeight, int maxRectangleCount = 40, double whitespaceFuzziness = 0.15, int maxBoundQueueSize = 0)
|
||||
{
|
||||
if (boundingboxes.Count() == 0) return EmptyArray<PdfRectangle>.Instance;
|
||||
if (!boundingboxes.Any()) return EmptyArray<PdfRectangle>.Instance;
|
||||
|
||||
var obstacles = new HashSet<PdfRectangle>(boundingboxes);
|
||||
var pageBound = GetBound(obstacles);
|
||||
@ -195,51 +195,32 @@
|
||||
return false;
|
||||
}
|
||||
|
||||
if (rectangle1.Left == rectangle2.Right ||
|
||||
rectangle1.Right == rectangle2.Left ||
|
||||
rectangle1.Bottom == rectangle2.Top ||
|
||||
rectangle1.Top == rectangle2.Bottom)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
return rectangle1.Left == rectangle2.Right ||
|
||||
rectangle1.Right == rectangle2.Left ||
|
||||
rectangle1.Bottom == rectangle2.Top ||
|
||||
rectangle1.Top == rectangle2.Bottom;
|
||||
}
|
||||
|
||||
private static bool IsAdjacentToPageBounds(PdfRectangle pageBound, PdfRectangle rectangle)
|
||||
{
|
||||
if (rectangle.Bottom == pageBound.Bottom ||
|
||||
rectangle.Top == pageBound.Top ||
|
||||
rectangle.Left == pageBound.Left ||
|
||||
rectangle.Right == pageBound.Right)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
return rectangle.Bottom == pageBound.Bottom ||
|
||||
rectangle.Top == pageBound.Top ||
|
||||
rectangle.Left == pageBound.Left ||
|
||||
rectangle.Right == pageBound.Right;
|
||||
}
|
||||
|
||||
private static bool OverlapsHard(PdfRectangle rectangle1, PdfRectangle rectangle2)
|
||||
{
|
||||
if (rectangle1.Left >= rectangle2.Right ||
|
||||
rectangle2.Left >= rectangle1.Right ||
|
||||
rectangle1.Top <= rectangle2.Bottom ||
|
||||
rectangle2.Top <= rectangle1.Bottom)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
return rectangle1.Left < rectangle2.Right &&
|
||||
rectangle2.Left < rectangle1.Right &&
|
||||
rectangle1.Top > rectangle2.Bottom &&
|
||||
rectangle2.Top > rectangle1.Bottom;
|
||||
}
|
||||
|
||||
private static bool Inside(PdfRectangle rectangle1, PdfRectangle rectangle2)
|
||||
{
|
||||
if (rectangle2.Right <= rectangle1.Right && rectangle2.Left >= rectangle1.Left &&
|
||||
rectangle2.Top <= rectangle1.Top && rectangle2.Bottom >= rectangle1.Bottom)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
return rectangle2.Right <= rectangle1.Right && rectangle2.Left >= rectangle1.Left &&
|
||||
rectangle2.Top <= rectangle1.Top && rectangle2.Bottom >= rectangle1.Bottom;
|
||||
}
|
||||
|
||||
private static PdfRectangle GetBound(IEnumerable<PdfRectangle> obstacles)
|
||||
@ -254,7 +235,7 @@
|
||||
#region Sorted Queue
|
||||
private class QueueEntries : SortedSet<QueueEntry>
|
||||
{
|
||||
readonly int bound;
|
||||
private readonly int bound;
|
||||
|
||||
public QueueEntries(int maximumBound)
|
||||
{
|
||||
@ -306,7 +287,7 @@
|
||||
|
||||
public bool IsEmptyEnough()
|
||||
{
|
||||
return !Obstacles.Any();
|
||||
return Obstacles.Count == 0;
|
||||
}
|
||||
|
||||
public bool IsEmptyEnough(IEnumerable<PdfRectangle> pageObstacles)
|
||||
@ -349,12 +330,11 @@
|
||||
{
|
||||
if (obj is QueueEntry entry)
|
||||
{
|
||||
if (Bound.Left != entry.Bound.Left ||
|
||||
Bound.Right != entry.Bound.Right ||
|
||||
Bound.Top != entry.Bound.Top ||
|
||||
Bound.Bottom != entry.Bound.Bottom ||
|
||||
Obstacles != entry.Obstacles) return false;
|
||||
return true;
|
||||
return Bound.Left == entry.Bound.Left &&
|
||||
Bound.Right == entry.Bound.Right &&
|
||||
Bound.Top == entry.Bound.Top &&
|
||||
Bound.Bottom == entry.Bound.Bottom &&
|
||||
Obstacles == entry.Obstacles;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -383,16 +363,6 @@
|
||||
// solution.
|
||||
return rectangle.Area * (rectangle.Height / 4.0);
|
||||
}
|
||||
|
||||
private static double OverlappingArea(PdfRectangle rectangle1, PdfRectangle rectangle2)
|
||||
{
|
||||
var intersect = rectangle1.Intersect(rectangle2);
|
||||
if (intersect.HasValue)
|
||||
{
|
||||
return intersect.Value.Area;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
#endregion
|
||||
}
|
||||
|
@ -99,8 +99,8 @@
|
||||
/// <param name="filterPivotFunction"></param>
|
||||
/// <param name="filterFunction">Function used to filter out connection between letters, e.g. check if the letters have the same color.
|
||||
/// <para>If the function returns false, a new word will be created.</para></param>
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
|
||||
/// <para>A positive property value limits the number of concurrent operations to the set value.
|
||||
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
|
||||
private List<Word> GetWords(IReadOnlyList<Letter> letters,
|
||||
Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure,
|
||||
|
Loading…
Reference in New Issue
Block a user