minor optimisations

This commit is contained in:
BobLd 2020-05-25 12:11:59 +01:00 committed by Eliot Jones
parent 755e199fed
commit 4312aa470e
8 changed files with 123 additions and 176 deletions

View File

@ -23,8 +23,8 @@
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T> elements,
Func<PdfPoint, PdfPoint, double> distMeasure,
@ -91,8 +91,8 @@
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T> elements, int k,
Func<PdfPoint, PdfPoint, double> distMeasure,
@ -159,8 +159,8 @@
/// <param name="candidatesLine">The candidates' line to use for pairing.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IEnumerable<IReadOnlyList<T>> NearestNeighbours<T>(IReadOnlyList<T> elements,
Func<PdfLine, PdfLine, double> distMeasure,

View File

@ -14,19 +14,19 @@
/// <summary>
/// Algorithm that retrieves blocks that are labelled as decoration (e.g. headers, footers) for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// <para>See section 4.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern.</para>
/// </summary>
public static class DecorationTextBlockClassifier
{
private static readonly Regex NumbersPattern = new Regex(@"(\d+)|(\b([MDCLXVI]+)\b)", RegexOptions.IgnoreCase);
private static string replacementChar = "@";
private const string replacementChar = "@";
/// <summary>
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// </summary>
/// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
@ -34,8 +34,8 @@
/// <param name="pageSegmenter"></param>
/// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<Page> pages,
IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
@ -47,7 +47,7 @@
/// <summary>
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// </summary>
/// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
@ -56,8 +56,8 @@
/// <param name="minimumEditDistanceNormalised">Minimum edit distance normalised. A value of 0 means both strings are exactly equal.</param>
/// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<Page> pages,
IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, Func<string, string, double> minimumEditDistanceNormalised,
@ -92,14 +92,14 @@
/// <summary>
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// </summary>
/// <param name="pagesTextBlocks">The <see cref="TextBlock"/>s of every page in the document. All of them are needed for the algorithm to work.</param>
/// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<IReadOnlyList<TextBlock>> pagesTextBlocks,
double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
@ -110,15 +110,15 @@
/// <summary>
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// </summary>
/// <param name="pagesTextBlocks">The <see cref="TextBlock"/>s of every page in the document. All of them are needed for the algorithm to work.</param>
/// <param name="minimumEditDistanceNormalised">Minimum edit distance normalised. A value of 0 means both strings are exactly equal.</param>
/// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<IReadOnlyList<TextBlock>> pagesTextBlocks,
Func<string, string, double> minimumEditDistanceNormalised, double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
@ -219,7 +219,7 @@
}
/// <summary>
/// [The content similarity] is calculated from the normalized edit
/// [The content similarity] is calculated from the normalized edit
/// distance between the two content strings, where digits are replaced with “@” chars.
/// A content similarity of 1 is reached when both strings are exactly equal.
/// </summary>
@ -248,7 +248,7 @@
}
/// <summary>
/// This similarity score is a value in the range [0,1] and given
/// This similarity score is a value in the range [0,1] and given
/// by the product between the content and the geometric similarity.
/// </summary>
private static double Similarity(TextBlock b1, TextBlock b2, Func<string, string, double> minimumEditDistanceNormalised)

View File

@ -15,7 +15,7 @@
/// <returns>The mode of the sequence. Returns <see cref="float.NaN"/> if the sequence has no mode or if it is not unique.</returns>
public static float Mode(this IEnumerable<float> array)
{
if (array == null || !array.Any()) return float.NaN;
if (array?.Any() != true) return float.NaN;
var sorted = array.GroupBy(v => v).Select(v => (v.Count(), v.Key)).OrderByDescending(g => g.Item1);
var mode = sorted.First();
if (sorted.Count() > 1 && mode.Item1 == sorted.ElementAt(1).Item1) return float.NaN;
@ -29,7 +29,7 @@
/// <returns>The mode of the sequence. Returns <see cref="double.NaN"/> if the sequence has no mode or if it is not unique.</returns>
public static double Mode(this IEnumerable<double> array)
{
if (array == null || !array.Any()) return double.NaN;
if (array?.Any() != true) return double.NaN;
var sorted = array.GroupBy(v => v).Select(v => (v.Count(), v.Key)).OrderByDescending(g => g.Item1);
var mode = sorted.First();
if (sorted.Count() > 1 && mode.Item1 == sorted.ElementAt(1).Item1) return double.NaN;

View File

@ -15,13 +15,13 @@
/// </summary>
public static UnsupervisedReadingOrderDetector Instance { get; } = new UnsupervisedReadingOrderDetector();
private double T;
private readonly double T;
/// <summary>
/// Algorithm that retrieves the blocks' reading order using both (spatial) Allen's interval relations and rendering order.
/// </summary>
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.
/// This flexibility is necessary because due to the inherent noise in the PDF extraction text blocks in the
/// This flexibility is necessary because due to the inherent noise in the PDF extraction text blocks in the
/// same column might not be exactly aligned.</param>
public UnsupervisedReadingOrderDetector(double T = 5)
{
@ -38,10 +38,10 @@
var graph = BuildGraph(textBlocks, T);
while (graph.Any())
while (graph.Count > 0)
{
var maxCount = graph.Max(kvp => kvp.Value.Count);
var current = graph.Where(kvp => kvp.Value.Count == maxCount).FirstOrDefault();
var current = graph.FirstOrDefault(kvp => kvp.Value.Count == maxCount);
graph.Remove(current.Key);
int index = current.Key;
@ -105,19 +105,14 @@
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
if (xRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Precedes ||
xRelation == IntervalRelations.Meets ||
yRelation == IntervalRelations.Meets ||
xRelation == IntervalRelations.Overlaps ||
yRelation == IntervalRelations.Overlaps)
{
return true;
}
return false;
return xRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Precedes ||
xRelation == IntervalRelations.Meets ||
yRelation == IntervalRelations.Meets ||
xRelation == IntervalRelations.Overlaps ||
yRelation == IntervalRelations.Overlaps;
}
/// <summary>
/// Column-wise: text-blocks are read in columns, from top-to-bottom and from left-to-right.
/// </summary>
@ -130,7 +125,7 @@
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
if (xRelation == IntervalRelations.Precedes ||
return xRelation == IntervalRelations.Precedes ||
xRelation == IntervalRelations.Meets ||
(xRelation == IntervalRelations.Overlaps && (yRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Meets ||
@ -146,12 +141,7 @@
xRelation == IntervalRelations.DuringI ||
xRelation == IntervalRelations.Finishes ||
xRelation == IntervalRelations.StartsI ||
xRelation == IntervalRelations.OverlapsI)))
{
return true;
}
return false;
xRelation == IntervalRelations.OverlapsI));
}
/// <summary>
@ -160,40 +150,34 @@
/// <param name="a"></param>
/// <param name="b"></param>
/// <param name="T">The tolerance parameter T.</param>
/// <returns></returns>
private bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T)
{
// Compute the X and Y interval relations of a relative to b, both with tolerance T.
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
// Diff rendering: the if/return-true/return-false form below is the pre-change body;
// the single return expression after it is the post-change body with the same condition.
//
// The condition holds when any of the following is true:
//  1. the Y relation is Precedes or Meets;
//  2. the Y relation is Overlaps and the X relation is Precedes, Meets or Overlaps;
//  3. the X relation is Precedes, Meets or Overlaps and the Y relation is one of the
//     eleven relations listed (Precedes ... OverlapsI).
if (yRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Meets ||
(yRelation == IntervalRelations.Overlaps && (xRelation == IntervalRelations.Precedes ||
xRelation == IntervalRelations.Meets ||
xRelation == IntervalRelations.Overlaps)) ||
((xRelation == IntervalRelations.Precedes || xRelation == IntervalRelations.Meets || xRelation == IntervalRelations.Overlaps) &&
(yRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Meets ||
yRelation == IntervalRelations.Overlaps ||
yRelation == IntervalRelations.Starts ||
yRelation == IntervalRelations.FinishesI ||
yRelation == IntervalRelations.Equals ||
yRelation == IntervalRelations.During ||
yRelation == IntervalRelations.DuringI ||
yRelation == IntervalRelations.Finishes ||
yRelation == IntervalRelations.StartsI ||
yRelation == IntervalRelations.OverlapsI)))
{
return true;
}
return false;
// Post-change form: returns the boolean expression directly.
return yRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Meets ||
(yRelation == IntervalRelations.Overlaps && (xRelation == IntervalRelations.Precedes ||
xRelation == IntervalRelations.Meets ||
xRelation == IntervalRelations.Overlaps)) ||
((xRelation == IntervalRelations.Precedes || xRelation == IntervalRelations.Meets || xRelation == IntervalRelations.Overlaps) &&
(yRelation == IntervalRelations.Precedes ||
yRelation == IntervalRelations.Meets ||
yRelation == IntervalRelations.Overlaps ||
yRelation == IntervalRelations.Starts ||
yRelation == IntervalRelations.FinishesI ||
yRelation == IntervalRelations.Equals ||
yRelation == IntervalRelations.During ||
yRelation == IntervalRelations.DuringI ||
yRelation == IntervalRelations.Finishes ||
yRelation == IntervalRelations.StartsI ||
yRelation == IntervalRelations.OverlapsI));
}
/// <summary>
/// Gets the Thick Boundary Rectangle Relations (TBRR) for the X coordinate.
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
/// </summary>
/// <param name="a"></param>
@ -201,85 +185,83 @@
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
private IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T)
{
// Classifies the horizontal relation of a relative to b by comparing a's Left/Right
// with b's Left/Right, each shifted by the tolerance T.
//
// Diff rendering: where an assignment to xRelation is followed by a return of the same
// value, or an else-if header appears twice in a row, the first line is the pre-change
// text and the second is the post-change text.
//
// NOTE(review): the first two conditions (a.Right < b.Left - T, then a.Right >= b.Left - T)
// are complementary, so for non-NaN coordinates one of them always matches. Every later
// branch, and the Unknown fallback, therefore looks unreachable — confirm the intended
// PrecedesI test.
// NOTE(review): several later conditions have the form (v - T > x && x > v + T), which can
// only hold when T < 0 — verify the inverse-relation conditions against the TBRR definitions.
IntervalRelations xRelation = IntervalRelations.Unknown;
if (a.BoundingBox.Right < b.BoundingBox.Left - T)
{
xRelation = IntervalRelations.Precedes;
return IntervalRelations.Precedes;
}
else if (a.BoundingBox.Right >= b.BoundingBox.Left - T)
{
xRelation = IntervalRelations.PrecedesI;
return IntervalRelations.PrecedesI;
}
else if (b.BoundingBox.Left - T <= a.BoundingBox.Right
&& a.BoundingBox.Right <= b.BoundingBox.Left + T)
{
xRelation = IntervalRelations.Meets;
return IntervalRelations.Meets;
}
else if (b.BoundingBox.Left - T > a.BoundingBox.Right
&& a.BoundingBox.Right > b.BoundingBox.Left + T)
{
xRelation = IntervalRelations.MeetsI;
return IntervalRelations.MeetsI;
}
else if (a.BoundingBox.Left < b.BoundingBox.Left - T
&& (b.BoundingBox.Left + T < a.BoundingBox.Right && a.BoundingBox.Right < b.BoundingBox.Right - T))
{
xRelation = IntervalRelations.Overlaps;
return IntervalRelations.Overlaps;
}
else if (a.BoundingBox.Left >= b.BoundingBox.Left - T
&& (b.BoundingBox.Left + T >= a.BoundingBox.Right && a.BoundingBox.Right >= b.BoundingBox.Right - T))
{
xRelation = IntervalRelations.OverlapsI;
return IntervalRelations.OverlapsI;
}
else if ((b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T)
else if (b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T
&& a.BoundingBox.Right < b.BoundingBox.Right - T)
{
xRelation = IntervalRelations.Starts;
return IntervalRelations.Starts;
}
else if ((b.BoundingBox.Left - T > a.BoundingBox.Left && a.BoundingBox.Left > b.BoundingBox.Left + T)
else if (b.BoundingBox.Left - T > a.BoundingBox.Left && a.BoundingBox.Left > b.BoundingBox.Left + T
&& a.BoundingBox.Right >= b.BoundingBox.Right - T)
{
xRelation = IntervalRelations.StartsI;
return IntervalRelations.StartsI;
}
else if (a.BoundingBox.Left > b.BoundingBox.Left + T
&& a.BoundingBox.Right < b.BoundingBox.Right - T)
{
xRelation = IntervalRelations.During;
return IntervalRelations.During;
}
else if (a.BoundingBox.Left <= b.BoundingBox.Left + T
&& a.BoundingBox.Right >= b.BoundingBox.Right - T)
{
xRelation = IntervalRelations.DuringI;
return IntervalRelations.DuringI;
}
else if (a.BoundingBox.Left > b.BoundingBox.Left + T
&& (b.BoundingBox.Right - T <= a.BoundingBox.Right && a.BoundingBox.Right <= b.BoundingBox.Right + T))
{
xRelation = IntervalRelations.Finishes;
return IntervalRelations.Finishes;
}
else if (a.BoundingBox.Left <= b.BoundingBox.Left + T
&& (b.BoundingBox.Right - T > a.BoundingBox.Right && a.BoundingBox.Right > b.BoundingBox.Right + T))
{
xRelation = IntervalRelations.FinishesI;
return IntervalRelations.FinishesI;
}
else if ((b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T)
else if (b.BoundingBox.Left - T <= a.BoundingBox.Left && a.BoundingBox.Left <= b.BoundingBox.Left + T
&& (b.BoundingBox.Right - T <= a.BoundingBox.Right && a.BoundingBox.Right <= b.BoundingBox.Right + T))
{
xRelation = IntervalRelations.Equals;
return IntervalRelations.Equals;
}
// Fallback when no branch matched: pre-change returns the local (still Unknown),
// post-change returns Unknown directly.
return xRelation;
return IntervalRelations.Unknown;
}
/// <summary>
/// Gets the Thick Boundary Rectangle Relations (TBRR) for the Y coordinate.
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
/// </summary>
/// <param name="a"></param>
@ -287,79 +269,77 @@
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
private IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T)
{
// Classifies the vertical relation of a relative to b by comparing a's Top/Bottom
// with b's Top/Bottom, each shifted by the tolerance T. Compared with the X version,
// each condition here yields the inverse-named relation (e.g. the "<" test returns
// PrecedesI rather than Precedes).
//
// Diff rendering: where an assignment to yRelation is followed by a return of the same
// value, or an else-if header appears twice in a row, the first line is the pre-change
// text and the second is the post-change text.
//
// NOTE(review): the first two conditions (a.Bottom < b.Top - T, then a.Bottom >= b.Top - T)
// are complementary, so for non-NaN coordinates one of them always matches. Every later
// branch, and the Unknown fallback, therefore looks unreachable — confirm the intended
// Precedes test.
// NOTE(review): several later conditions have the form (v - T > x && x > v + T), which can
// only hold when T < 0 — verify against the TBRR definitions.
IntervalRelations yRelation = IntervalRelations.Unknown;
if (a.BoundingBox.Bottom < b.BoundingBox.Top - T)
{
yRelation = IntervalRelations.PrecedesI;
return IntervalRelations.PrecedesI;
}
else if (a.BoundingBox.Bottom >= b.BoundingBox.Top - T)
{
yRelation = IntervalRelations.Precedes;
return IntervalRelations.Precedes;
}
else if (b.BoundingBox.Top - T <= a.BoundingBox.Bottom
&& a.BoundingBox.Bottom <= b.BoundingBox.Top + T)
{
yRelation = IntervalRelations.MeetsI;
return IntervalRelations.MeetsI;
}
else if (b.BoundingBox.Top - T > a.BoundingBox.Bottom
&& a.BoundingBox.Bottom > b.BoundingBox.Top + T)
{
yRelation = IntervalRelations.Meets;
return IntervalRelations.Meets;
}
else if (a.BoundingBox.Top < b.BoundingBox.Top - T
&& (b.BoundingBox.Top + T < a.BoundingBox.Bottom && a.BoundingBox.Bottom < b.BoundingBox.Bottom - T))
{
yRelation = IntervalRelations.OverlapsI;
return IntervalRelations.OverlapsI;
}
else if (a.BoundingBox.Top >= b.BoundingBox.Top - T
&& (b.BoundingBox.Top + T >= a.BoundingBox.Bottom && a.BoundingBox.Bottom >= b.BoundingBox.Bottom - T))
{
yRelation = IntervalRelations.Overlaps;
return IntervalRelations.Overlaps;
}
else if ((b.BoundingBox.Top - T <= a.BoundingBox.Top && a.BoundingBox.Top <= b.BoundingBox.Top + T)
else if (b.BoundingBox.Top - T <= a.BoundingBox.Top && a.BoundingBox.Top <= b.BoundingBox.Top + T
&& a.BoundingBox.Bottom < b.BoundingBox.Bottom - T)
{
yRelation = IntervalRelations.StartsI;
return IntervalRelations.StartsI;
}
else if ((b.BoundingBox.Top - T > a.BoundingBox.Top && a.BoundingBox.Top > b.BoundingBox.Top + T)
else if (b.BoundingBox.Top - T > a.BoundingBox.Top && a.BoundingBox.Top > b.BoundingBox.Top + T
&& a.BoundingBox.Bottom >= b.BoundingBox.Bottom - T)
{
yRelation = IntervalRelations.Starts;
return IntervalRelations.Starts;
}
else if (a.BoundingBox.Top > b.BoundingBox.Top + T
&& a.BoundingBox.Bottom < b.BoundingBox.Bottom - T)
{
yRelation = IntervalRelations.DuringI;
return IntervalRelations.DuringI;
}
else if (a.BoundingBox.Top <= b.BoundingBox.Top + T
&& a.BoundingBox.Bottom >= b.BoundingBox.Bottom - T)
{
yRelation = IntervalRelations.During;
return IntervalRelations.During;
}
else if (a.BoundingBox.Top > b.BoundingBox.Top + T
&& (b.BoundingBox.Bottom - T <= a.BoundingBox.Bottom && a.BoundingBox.Bottom <= b.BoundingBox.Bottom + T))
{
yRelation = IntervalRelations.FinishesI;
return IntervalRelations.FinishesI;
}
else if (a.BoundingBox.Top <= b.BoundingBox.Top + T
&& (b.BoundingBox.Bottom - T > a.BoundingBox.Bottom && a.BoundingBox.Bottom > b.BoundingBox.Bottom + T))
{
yRelation = IntervalRelations.Finishes;
return IntervalRelations.Finishes;
}
else if ((b.BoundingBox.Top - T <= a.BoundingBox.Top && a.BoundingBox.Top <= b.BoundingBox.Top + T)
&& (b.BoundingBox.Bottom - T <= a.BoundingBox.Bottom && a.BoundingBox.Bottom <= b.BoundingBox.Bottom + T))
{
yRelation = IntervalRelations.Equals;
return IntervalRelations.Equals;
}
// Fallback when no branch matched: pre-change returns the local (still Unknown),
// post-change returns Unknown directly.
return yRelation;
return IntervalRelations.Unknown;
}
/// <summary>

View File

@ -29,15 +29,15 @@
/// </summary>
/// <param name="pageWords">The words in the page.</param>
/// <param name="minimumElements">The minimum number of elements to define a text edge.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyDictionary<EdgeType, List<PdfLine>> GetEdges(IEnumerable<Word> pageWords, int minimumElements = 4,
int maxDegreeOfParallelism = -1)
{
if (minimumElements < 0)
{
throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", "minimumElements");
throw new ArgumentException("TextEdgesExtractor.GetEdges(): The minimum number of elements should be positive.", nameof(minimumElements));
}
var cleanWords = pageWords.Where(x => !string.IsNullOrWhiteSpace(x.Text.Trim()));
@ -46,10 +46,7 @@
ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };
Parallel.ForEach(edgesFuncs, parallelOptions, f =>
{
dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements));
});
Parallel.ForEach(edgesFuncs, parallelOptions, f => dictionary.TryAdd(f.Item1, GetVerticalEdges(cleanWords, f.Item2, minimumElements)));
return dictionary.ToDictionary(x => x.Key, x => x.Value);
}

View File

@ -68,7 +68,7 @@
else if (previous.Value != " ")
{
var gap = letter.StartBaseLine.X - previous.EndBaseLine.X;
if (WhitespaceSizeStatistics.IsProbablyWhitespace(gap, previous))
{
sb.Append(" ");

View File

@ -39,7 +39,7 @@
/// <param name="minWidth">Lower bounds for the width of rectangles.</param>
/// <param name="minHeight">Lower bounds for the height of rectangles.</param>
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
/// surrounding obstacles by some percent. Default value is 15%.</param>
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
/// <returns>The identified whitespace rectangles.</returns>
@ -49,7 +49,7 @@
var bboxes = words.Where(w => w.BoundingBox.Width > 0 && w.BoundingBox.Height > 0)
.Select(o => o.BoundingBox).ToList();
if (images != null && images.Count() > 0)
if (images?.Any() == true)
{
bboxes.AddRange(images.Where(w => w.Bounds.Width > 0 && w.Bounds.Height > 0).Select(o => o.Bounds));
}
@ -69,14 +69,14 @@
/// <param name="minWidth">Lower bounds for the width of rectangles.</param>
/// <param name="minHeight">Lower bounds for the height of rectangles.</param>
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
/// surrounding obstacles by some percent. Default value is 15%.</param>
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
/// <returns>The identified whitespace rectangles.</returns>
public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<PdfRectangle> boundingboxes,
double minWidth, double minHeight, int maxRectangleCount = 40, double whitespaceFuzziness = 0.15, int maxBoundQueueSize = 0)
{
if (boundingboxes.Count() == 0) return EmptyArray<PdfRectangle>.Instance;
if (!boundingboxes.Any()) return EmptyArray<PdfRectangle>.Instance;
var obstacles = new HashSet<PdfRectangle>(boundingboxes);
var pageBound = GetBound(obstacles);
@ -195,51 +195,32 @@
return false;
}
if (rectangle1.Left == rectangle2.Right ||
rectangle1.Right == rectangle2.Left ||
rectangle1.Bottom == rectangle2.Top ||
rectangle1.Top == rectangle2.Bottom)
{
return true;
}
return false;
return rectangle1.Left == rectangle2.Right ||
rectangle1.Right == rectangle2.Left ||
rectangle1.Bottom == rectangle2.Top ||
rectangle1.Top == rectangle2.Bottom;
}
/// <summary>
/// Checks whether <paramref name="rectangle"/> has at least one edge coordinate
/// (Bottom, Top, Left or Right) exactly equal to the same edge of <paramref name="pageBound"/>.
/// </summary>
/// <param name="pageBound">The rectangle whose edges are compared against.</param>
/// <param name="rectangle">The rectangle to test.</param>
/// <returns>True if any of the four edge coordinates are equal; otherwise false.</returns>
private static bool IsAdjacentToPageBounds(PdfRectangle pageBound, PdfRectangle rectangle)
{
// Diff rendering: the if/return block is the pre-change body; the single return
// expression after it is the post-change body with the same condition.
// The comparisons use exact equality, with no tolerance.
if (rectangle.Bottom == pageBound.Bottom ||
rectangle.Top == pageBound.Top ||
rectangle.Left == pageBound.Left ||
rectangle.Right == pageBound.Right)
{
return true;
}
return false;
return rectangle.Bottom == pageBound.Bottom ||
rectangle.Top == pageBound.Top ||
rectangle.Left == pageBound.Left ||
rectangle.Right == pageBound.Right;
}
/// <summary>
/// Checks whether the two rectangles overlap with a non-empty interior: rectangles
/// that only touch along an edge are not reported as overlapping.
/// </summary>
/// <param name="rectangle1">The first rectangle.</param>
/// <param name="rectangle2">The second rectangle.</param>
/// <returns>True if the rectangles strictly overlap on both axes; otherwise false.</returns>
private static bool OverlapsHard(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
// Diff rendering: the if/return-false/return-true form is the pre-change body;
// the single return after it is the post-change body (the negated condition).
// NOTE(review): the two forms agree for ordinary values but differ if a coordinate
// is NaN (pre-change falls through to true, post-change yields false) — confirm
// NaN coordinates cannot reach this method.
if (rectangle1.Left >= rectangle2.Right ||
rectangle2.Left >= rectangle1.Right ||
rectangle1.Top <= rectangle2.Bottom ||
rectangle2.Top <= rectangle1.Bottom)
{
return false;
}
return true;
return rectangle1.Left < rectangle2.Right &&
rectangle2.Left < rectangle1.Right &&
rectangle1.Top > rectangle2.Bottom &&
rectangle2.Top > rectangle1.Bottom;
}
/// <summary>
/// Checks whether <paramref name="rectangle2"/> lies within <paramref name="rectangle1"/>.
/// The bounds are inclusive, so coinciding edges count as inside.
/// </summary>
/// <param name="rectangle1">The containing rectangle.</param>
/// <param name="rectangle2">The rectangle tested for containment.</param>
/// <returns>True if all four edges of rectangle2 are within rectangle1; otherwise false.</returns>
private static bool Inside(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
// Diff rendering: the if/return block is the pre-change body; the single return
// expression after it is the post-change body with the same condition.
if (rectangle2.Right <= rectangle1.Right && rectangle2.Left >= rectangle1.Left &&
rectangle2.Top <= rectangle1.Top && rectangle2.Bottom >= rectangle1.Bottom)
{
return true;
}
return false;
return rectangle2.Right <= rectangle1.Right && rectangle2.Left >= rectangle1.Left &&
rectangle2.Top <= rectangle1.Top && rectangle2.Bottom >= rectangle1.Bottom;
}
private static PdfRectangle GetBound(IEnumerable<PdfRectangle> obstacles)
@ -254,7 +235,7 @@
#region Sorted Queue
private class QueueEntries : SortedSet<QueueEntry>
{
readonly int bound;
private readonly int bound;
public QueueEntries(int maximumBound)
{
@ -306,7 +287,7 @@
/// <summary>
/// Indicates whether this entry has no obstacles.
/// </summary>
/// <returns>True if <c>Obstacles</c> contains no elements; otherwise false.</returns>
public bool IsEmptyEnough()
{
// Diff rendering: the first return is the pre-change line (LINQ Any());
// the second is its post-change replacement (Count property).
return !Obstacles.Any();
return Obstacles.Count == 0;
}
public bool IsEmptyEnough(IEnumerable<PdfRectangle> pageObstacles)
@ -349,12 +330,11 @@
{
if (obj is QueueEntry entry)
{
if (Bound.Left != entry.Bound.Left ||
Bound.Right != entry.Bound.Right ||
Bound.Top != entry.Bound.Top ||
Bound.Bottom != entry.Bound.Bottom ||
Obstacles != entry.Obstacles) return false;
return true;
return Bound.Left == entry.Bound.Left &&
Bound.Right == entry.Bound.Right &&
Bound.Top == entry.Bound.Top &&
Bound.Bottom == entry.Bound.Bottom &&
Obstacles == entry.Obstacles;
}
return false;
}
@ -383,16 +363,6 @@
// solution.
return rectangle.Area * (rectangle.Height / 4.0);
}
/// <summary>
/// Gets the area of the intersection of the two rectangles.
/// </summary>
/// <param name="rectangle1">The first rectangle.</param>
/// <param name="rectangle2">The second rectangle.</param>
/// <returns>The intersection's Area when <c>Intersect</c> yields a value; otherwise 0.</returns>
// NOTE(review): the hunk header (16 lines before, 6 after) indicates this commit
// deletes this method — presumably it had no remaining callers; confirm.
private static double OverlappingArea(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
var intersect = rectangle1.Intersect(rectangle2);
// Intersect returns a nullable result; no value is treated as zero overlapping area.
if (intersect.HasValue)
{
return intersect.Value.Area;
}
return 0;
}
}
#endregion
}

View File

@ -99,8 +99,8 @@
/// <param name="filterPivotFunction"></param>
/// <param name="filterFunction">Function used to filter out connection between letters, e.g. check if the letters have the same color.
/// <para>If the function returns false, a new word will be created.</para></param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
private List<Word> GetWords(IReadOnlyList<Letter> letters,
Func<Letter, Letter, double> maxDistanceFunction, Func<PdfPoint, PdfPoint, double> distMeasure,