From 50178ee1ab6794e6d59f9e3328af3bfb5318b621 Mon Sep 17 00:00:00 2001 From: Looly Date: Wed, 15 Mar 2023 09:12:56 +0800 Subject: [PATCH] add NFA --- .../text/dfa/{Automaton.java => NFA.java} | 43 ++-- .../hutool/core/text/dfa/AutomatonTest.java | 225 ----------------- .../java/cn/hutool/core/text/dfa/NFATest.java | 228 ++++++++++++++++++ 3 files changed, 250 insertions(+), 246 deletions(-) rename hutool-core/src/main/java/cn/hutool/core/text/dfa/{Automaton.java => NFA.java} (73%) delete mode 100644 hutool-core/src/test/java/cn/hutool/core/text/dfa/AutomatonTest.java create mode 100644 hutool-core/src/test/java/cn/hutool/core/text/dfa/NFATest.java diff --git a/hutool-core/src/main/java/cn/hutool/core/text/dfa/Automaton.java b/hutool-core/src/main/java/cn/hutool/core/text/dfa/NFA.java similarity index 73% rename from hutool-core/src/main/java/cn/hutool/core/text/dfa/Automaton.java rename to hutool-core/src/main/java/cn/hutool/core/text/dfa/NFA.java index c34f8b676..468658a62 100644 --- a/hutool-core/src/main/java/cn/hutool/core/text/dfa/Automaton.java +++ b/hutool-core/src/main/java/cn/hutool/core/text/dfa/NFA.java @@ -9,13 +9,13 @@ import java.util.*; * * @author renyp */ -public class Automaton { +public class NFA { private final Node root; /** * 默认构造 */ - public Automaton() { + public NFA() { this.root = new Node(); } @@ -24,7 +24,7 @@ public class Automaton { * * @param words 添加的新词 */ - public Automaton(String... words) { + public NFA(final String... words) { this(); this.insert(words); } @@ -34,14 +34,13 @@ public class Automaton { * * @param word 添加的新词 */ - public void insert(String word) { + public void insert(final String word) { Node p = root; - for (char curr : word.toCharArray()) { - int ind = curr; - if (p.next.get(ind) == null) { - p.next.put(ind, new Node()); + for (final char curr : word.toCharArray()) { + if (p.next.get((int) curr) == null) { + p.next.put((int) curr, new Node()); } - p = p.next.get(ind); + p = p.next.get((int) curr); } p.flag = true; p.str = word; @@ -52,8 +51,8 @@ public class Automaton { * * @param words 添加的新词 */ - public void insert(String... words) { - for (String word : words) { + public void insert(final String... words) { + for (final String word : words) { this.insert(word); } } @@ -62,15 +61,15 @@ public class Automaton { * 构建基于NFA模型的 AC自动机 */ public void buildAc() { - Queue queue = new LinkedList<>(); - Node p = root; - for (Integer key : p.next.keySet()) { + final Queue queue = new LinkedList<>(); + final Node p = root; + for (final Integer key : p.next.keySet()) { p.next.get(key).fail = root; queue.offer(p.next.get(key)); } while (!queue.isEmpty()) { - Node curr = queue.poll(); - for (Integer key : curr.next.keySet()) { + final Node curr = queue.poll(); + for (final Integer key : curr.next.keySet()) { Node fail = curr.fail; // 查找当前节点匹配失败,他对应等效匹配的节点是哪个 while (fail != null && fail.next.get(key) == null) { @@ -90,20 +89,22 @@ public class Automaton { /** * @param text 查询的文本(母串) + * @return 关键字列表 */ - public List find(String text) { + public List find(final String text) { return this.find(text, true); } /** * @param text 查找的文本(母串) * @param isDensityMatch 是否密集匹配 + * @return 关键字列表 */ - public List find(String text, boolean isDensityMatch) { - List ans = new ArrayList<>(); - Node p = root, k = null; + public List find(final String text, final boolean isDensityMatch) { + final List ans = new ArrayList<>(); + Node p = root, k; for (int i = 0, len = text.length(); i < len; i++) { - int ind = text.charAt(i); + final int ind = text.charAt(i); // 状态转移(沿着fail指针链接的链表,此处区别于DFA模型) while (p != null && p.next.get(ind) == null) { p = p.fail; diff --git a/hutool-core/src/test/java/cn/hutool/core/text/dfa/AutomatonTest.java b/hutool-core/src/test/java/cn/hutool/core/text/dfa/AutomatonTest.java deleted file mode 100644 index 9acdf13b9..000000000 --- a/hutool-core/src/test/java/cn/hutool/core/text/dfa/AutomatonTest.java +++ /dev/null @@ -1,225 +0,0 @@ -package cn.hutool.core.text.dfa; - -import cn.hutool.core.date.StopWatch; -import junit.framework.TestCase; -import org.junit.Assert; -import org.junit.Test; - -import java.util.List; -import java.util.stream.Collectors; - -public class AutomatonTest extends TestCase { - - /** - * 密集匹配 测试查找结果,并与WordTree对比效率 - */ - public void testFind() { - Automaton automaton = new Automaton(); - WordTree wordTree = new WordTree(); - automaton.insert("say", "her", "he", "she", "shr"); - automaton.buildAc(); - wordTree.addWords("say", "her", "he", "she", "shr"); - - StopWatch stopWatch = new StopWatch(); - String input = "sasherhsay"; - - stopWatch.start("automaton_char_find"); - List ans1 = automaton.find(input); - stopWatch.stop(); - assertEquals("she,he,her,say", ans1.stream().map(FoundWord::getWord).collect(Collectors.joining(","))); - assertEquals(Integer.valueOf(2), ans1.get(0).getStartIndex()); - assertEquals(Integer.valueOf(4), ans1.get(0).getEndIndex()); - assertEquals(Integer.valueOf(3), ans1.get(1).getStartIndex()); - assertEquals(Integer.valueOf(4), ans1.get(1).getEndIndex()); - assertEquals(Integer.valueOf(3), ans1.get(2).getStartIndex()); - assertEquals(Integer.valueOf(5), ans1.get(2).getEndIndex()); - assertEquals(Integer.valueOf(7), ans1.get(3).getStartIndex()); - assertEquals(Integer.valueOf(9), ans1.get(3).getEndIndex()); - - stopWatch.start("wordtree_char_find"); - List ans2 = wordTree.matchAll(input, -1, true, true); - stopWatch.stop(); - assertEquals("she,he,her,say", String.join(",", ans2)); - - System.out.println(stopWatch.prettyPrint()); - } - - /** - * 非密集匹配 测试查找结果,并与WordTree对比效率 - */ - public void testFindNotDensity() { - Automaton automaton = new Automaton(); - WordTree wordTree = new WordTree(); - automaton.insert("say", "her", "he", "she", "shr"); - automaton.buildAc(); - wordTree.addWords("say", "her", "he", "she", "shr"); - - StopWatch stopWatch = new StopWatch(); - String input = "sasherhsay"; - - stopWatch.start("automaton_char_find_not_density"); - List ans1 = automaton.find(input, false); - stopWatch.stop(); - assertEquals("she,say", ans1.stream().map(FoundWord::getWord).collect(Collectors.joining(","))); - assertEquals(Integer.valueOf(2), ans1.get(0).getStartIndex()); - assertEquals(Integer.valueOf(4), ans1.get(0).getEndIndex()); - assertEquals(Integer.valueOf(7), ans1.get(1).getStartIndex()); - assertEquals(Integer.valueOf(9), ans1.get(1).getEndIndex()); - - stopWatch.start("wordtree_char_find_not_density"); - List ans2 = wordTree.matchAll(input, -1, false, true); - stopWatch.stop(); - assertEquals("she,say", String.join(",", ans2)); - - System.out.println(stopWatch.prettyPrint()); - } - - /** - * 密集匹配 测试建树和查找,并与WordTree对比效率 - */ - public void testBuildAndFind() { - StopWatch stopWatch = new StopWatch(); - String input = "sasherhsay"; - - stopWatch.start("automaton_char_buid_find"); - Automaton automatonLocal = new Automaton(); - automatonLocal.insert("say", "her", "he", "she", "shr"); - automatonLocal.buildAc(); - List ans1 = automatonLocal.find(input); - stopWatch.stop(); - assertEquals("she,he,her,say", ans1.stream().map(FoundWord::getWord).collect(Collectors.joining(","))); - assertEquals(Integer.valueOf(2), ans1.get(0).getStartIndex()); - assertEquals(Integer.valueOf(4), ans1.get(0).getEndIndex()); - assertEquals(Integer.valueOf(3), ans1.get(1).getStartIndex()); - assertEquals(Integer.valueOf(4), ans1.get(1).getEndIndex()); - assertEquals(Integer.valueOf(3), ans1.get(2).getStartIndex()); - assertEquals(Integer.valueOf(5), ans1.get(2).getEndIndex()); - assertEquals(Integer.valueOf(7), ans1.get(3).getStartIndex()); - assertEquals(Integer.valueOf(9), ans1.get(3).getEndIndex()); - - stopWatch.start("wordtree_char_build_find"); - WordTree wordTreeLocal = new WordTree(); - wordTreeLocal.addWords("say", "her", "he", "she", "shr"); - List ans2 = wordTreeLocal.matchAll(input, -1, true, true); - stopWatch.stop(); - assertEquals("she,he,her,say", String.join(",", ans2)); - - System.out.println(stopWatch.prettyPrint()); - } - - /** - * 密集匹配 构建树和查找 测试中文字符,并与wordTree对比效率 - */ - @Test - public void testBuildFindCnChar() { - StopWatch stopWatch = new StopWatch(); - String input = "赵啊三在做什么"; - - stopWatch.start("automaton_cn_build_find"); - Automaton automatonLocal = new Automaton(); - automatonLocal.insert("赵", "赵啊", "赵啊三"); - automatonLocal.buildAc(); - - final List result = automatonLocal.find(input); - stopWatch.stop(); - - Assert.assertEquals(3, result.size()); - Assert.assertEquals("赵,赵啊,赵啊三", result.stream().map(FoundWord::getWord).collect(Collectors.joining(","))); - assertEquals(Integer.valueOf(0), result.get(0).getStartIndex()); - assertEquals(Integer.valueOf(0), result.get(0).getEndIndex()); - assertEquals(Integer.valueOf(0), result.get(1).getStartIndex()); - assertEquals(Integer.valueOf(1), result.get(1).getEndIndex()); - assertEquals(Integer.valueOf(0), result.get(2).getStartIndex()); - assertEquals(Integer.valueOf(2), result.get(2).getEndIndex()); - - stopWatch.start("wordtree_cn_build_find"); - WordTree wordTreeLocal = new WordTree(); - wordTreeLocal.addWords("赵", "赵啊", "赵啊三"); - - final List result1 = wordTreeLocal.matchAll(input, -1, true, true); - stopWatch.stop(); - - Assert.assertEquals(3, result1.size()); - Assert.assertEquals("赵,赵啊,赵啊三", String.join(",", result1)); - - System.out.println(stopWatch.prettyPrint()); - - } - - /** - * 密集匹配 测试构建树和查找 中文字符,并与wordTree对比效率 - */ - @Test - public void testFindCNChar() { - StopWatch stopWatch = new StopWatch(); - String input = "赵啊三在做什么"; - - Automaton automatonLocal = new Automaton(); - automatonLocal.insert("赵", "赵啊", "赵啊三"); - automatonLocal.buildAc(); - - stopWatch.start("automaton_cn_find"); - final List result = automatonLocal.find(input); - stopWatch.stop(); - - Assert.assertEquals(3, result.size()); - Assert.assertEquals("赵,赵啊,赵啊三", result.stream().map(FoundWord::getWord).collect(Collectors.joining(","))); - assertEquals(Integer.valueOf(0), result.get(0).getStartIndex()); - assertEquals(Integer.valueOf(0), result.get(0).getEndIndex()); - assertEquals(Integer.valueOf(0), result.get(1).getStartIndex()); - assertEquals(Integer.valueOf(1), result.get(1).getEndIndex()); - assertEquals(Integer.valueOf(0), result.get(2).getStartIndex()); - assertEquals(Integer.valueOf(2), result.get(2).getEndIndex()); - - WordTree wordTreeLocal = new WordTree(); - wordTreeLocal.addWords("赵", "赵啊", "赵啊三"); - - stopWatch.start("wordtree_cn_find"); - final List result1 = wordTreeLocal.matchAllWords(input, -1, true, true).stream().map(FoundWord::getWord) - .collect(Collectors.toList()); - stopWatch.stop(); - - Assert.assertEquals(3, result1.size()); - Assert.assertEquals("赵,赵啊,赵啊三", String.join(",", result1)); - - System.out.println(stopWatch.prettyPrint()); - - } - - /** - * 非密集匹配 测试构建树和查找 中文字符,并与wordTree对比效率, - */ - @Test - public void testFindCNCharNotDensity() { - StopWatch stopWatch = new StopWatch(); - String input = "赵啊三在做什么"; - - Automaton automatonLocal = new Automaton(); - automatonLocal.insert("赵", "赵啊", "赵啊三"); - automatonLocal.buildAc(); - - stopWatch.start("automaton_cn_find_not_density"); - final List result = automatonLocal.find(input, false); - stopWatch.stop(); - - Assert.assertEquals(1, result.size()); - Assert.assertEquals("赵", result.stream().map(FoundWord::getWord).collect(Collectors.joining(","))); - assertEquals(Integer.valueOf(0), result.get(0).getStartIndex()); - assertEquals(Integer.valueOf(0), result.get(0).getEndIndex()); - - WordTree wordTreeLocal = new WordTree(); - wordTreeLocal.addWords("赵", "赵啊", "赵啊三"); - - stopWatch.start("wordtree_cn_find_not_density"); - final List result1 = - wordTreeLocal.matchAllWords(input, -1, false, true).stream().map(FoundWord::getWord) - .collect(Collectors.toList()); - stopWatch.stop(); - - Assert.assertEquals(1, result1.size()); - Assert.assertEquals("赵", String.join(",", result1)); - - System.out.println(stopWatch.prettyPrint()); - - } -} diff --git a/hutool-core/src/test/java/cn/hutool/core/text/dfa/NFATest.java b/hutool-core/src/test/java/cn/hutool/core/text/dfa/NFATest.java new file mode 100644 index 000000000..e38be87f3 --- /dev/null +++ b/hutool-core/src/test/java/cn/hutool/core/text/dfa/NFATest.java @@ -0,0 +1,228 @@ +package cn.hutool.core.text.dfa; + +import cn.hutool.core.date.StopWatch; +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; +import java.util.stream.Collectors; + +public class NFATest { + + /** + * 密集匹配 测试查找结果,并与WordTree对比效率 + */ + @Test + public void testFind() { + final NFA NFA = new NFA(); + NFA.insert("say", "her", "he", "she", "shr"); + NFA.buildAc(); + + final WordTree wordTree = new WordTree(); + wordTree.addWords("say", "her", "he", "she", "shr"); + + final StopWatch stopWatch = new StopWatch(); + final String input = "sasherhsay"; + + stopWatch.start("automaton_char_find"); + final List ans1 = NFA.find(input); + stopWatch.stop(); + + Assert.assertEquals("she,he,her,say", ans1.stream().map(FoundWord::getWord).collect(Collectors.joining(","))); + Assert.assertEquals(2, ans1.get(0).getBeginIndex().intValue()); + Assert.assertEquals(4, ans1.get(0).getEndIndex().intValue()); + Assert.assertEquals(3, ans1.get(1).getBeginIndex().intValue()); + Assert.assertEquals(4, ans1.get(1).getEndIndex().intValue()); + Assert.assertEquals(3, ans1.get(2).getBeginIndex().intValue()); + Assert.assertEquals(5, ans1.get(2).getEndIndex().intValue()); + Assert.assertEquals(7, ans1.get(3).getBeginIndex().intValue()); + Assert.assertEquals(9, ans1.get(3).getEndIndex().intValue()); + + stopWatch.start("wordtree_char_find"); + final List ans2 = wordTree.matchAll(input, -1, true, true); + stopWatch.stop(); + Assert.assertEquals("she,he,her,say", String.join(",", ans2)); + + //Console.log(stopWatch.prettyPrint()); + } + + /** + * 非密集匹配 测试查找结果,并与WordTree对比效率 + */ + @Test + public void testFindNotDensity() { + final NFA NFA = new NFA(); + NFA.insert("say", "her", "he", "she", "shr"); + NFA.buildAc(); + + final WordTree wordTree = new WordTree(); + wordTree.addWords("say", "her", "he", "she", "shr"); + + final StopWatch stopWatch = new StopWatch(); + final String input = "sasherhsay"; + + stopWatch.start("automaton_char_find_not_density"); + final List ans1 = NFA.find(input, false); + stopWatch.stop(); + Assert.assertEquals("she,say", ans1.stream().map(FoundWord::getWord).collect(Collectors.joining(","))); + Assert.assertEquals(2, ans1.get(0).getBeginIndex().intValue()); + Assert.assertEquals(4, ans1.get(0).getEndIndex().intValue()); + Assert.assertEquals(7, ans1.get(1).getBeginIndex().intValue()); + Assert.assertEquals(9, ans1.get(1).getEndIndex().intValue()); + + stopWatch.start("wordtree_char_find_not_density"); + final List ans2 = wordTree.matchAll(input, -1, false, true); + stopWatch.stop(); + Assert.assertEquals("she,say", String.join(",", ans2)); + + //Console.log(stopWatch.prettyPrint()); + } + + /** + * 密集匹配 测试建树和查找,并与WordTree对比效率 + */ + @Test + public void testBuildAndFind() { + final StopWatch stopWatch = new StopWatch(); + final String input = "sasherhsay"; + + stopWatch.start("automaton_char_buid_find"); + final NFA NFALocal = new NFA(); + NFALocal.insert("say", "her", "he", "she", "shr"); + NFALocal.buildAc(); + final List ans1 = NFALocal.find(input); + stopWatch.stop(); + + Assert.assertEquals("she,he,her,say", ans1.stream().map(FoundWord::getWord).collect(Collectors.joining(","))); + Assert.assertEquals(2, ans1.get(0).getBeginIndex().intValue()); + Assert.assertEquals(4, ans1.get(0).getEndIndex().intValue()); + Assert.assertEquals(3, ans1.get(1).getBeginIndex().intValue()); + Assert.assertEquals(4, ans1.get(1).getEndIndex().intValue()); + Assert.assertEquals(3, ans1.get(2).getBeginIndex().intValue()); + Assert.assertEquals(5, ans1.get(2).getEndIndex().intValue()); + Assert.assertEquals(7, ans1.get(3).getBeginIndex().intValue()); + Assert.assertEquals(9, ans1.get(3).getEndIndex().intValue()); + + stopWatch.start("wordtree_char_build_find"); + final WordTree wordTreeLocal = new WordTree(); + wordTreeLocal.addWords("say", "her", "he", "she", "shr"); + final List ans2 = wordTreeLocal.matchAll(input, -1, true, true); + stopWatch.stop(); + Assert.assertEquals("she,he,her,say", String.join(",", ans2)); + + //Console.log(stopWatch.prettyPrint()); + } + + /** + * 密集匹配 构建树和查找 测试中文字符,并与wordTree对比效率 + */ + @Test + public void buildFindCnCharTest() { + final StopWatch stopWatch = new StopWatch(); + final String input = "赵啊三在做什么"; + + stopWatch.start("automaton_cn_build_find"); + final NFA NFALocal = new NFA(); + NFALocal.insert("赵", "赵啊", "赵啊三"); + NFALocal.buildAc(); + + final List result = NFALocal.find(input); + stopWatch.stop(); + + Assert.assertEquals(3, result.size()); + Assert.assertEquals("赵,赵啊,赵啊三", result.stream().map(FoundWord::getWord).collect(Collectors.joining(","))); + Assert.assertEquals(Integer.valueOf(0), result.get(0).getBeginIndex()); + Assert.assertEquals(Integer.valueOf(0), result.get(0).getEndIndex()); + Assert.assertEquals(Integer.valueOf(0), result.get(1).getBeginIndex()); + Assert.assertEquals(Integer.valueOf(1), result.get(1).getEndIndex()); + Assert.assertEquals(Integer.valueOf(0), result.get(2).getBeginIndex()); + Assert.assertEquals(Integer.valueOf(2), result.get(2).getEndIndex()); + + stopWatch.start("wordtree_cn_build_find"); + final WordTree wordTreeLocal = new WordTree(); + wordTreeLocal.addWords("赵", "赵啊", "赵啊三"); + + final List result1 = wordTreeLocal.matchAll(input, -1, true, true); + stopWatch.stop(); + + Assert.assertEquals(3, result1.size()); + Assert.assertEquals("赵,赵啊,赵啊三", String.join(",", result1)); + + //Console.log(stopWatch.prettyPrint()); + } + + /** + * 密集匹配 测试构建树和查找 中文字符,并与wordTree对比效率 + */ + @Test + public void testFindCNChar() { + final StopWatch stopWatch = new StopWatch(); + final String input = "赵啊三在做什么"; + + final NFA NFALocal = new NFA(); + NFALocal.insert("赵", "赵啊", "赵啊三"); + NFALocal.buildAc(); + + stopWatch.start("automaton_cn_find"); + final List result = NFALocal.find(input); + stopWatch.stop(); + + Assert.assertEquals(3, result.size()); + Assert.assertEquals("赵,赵啊,赵啊三", result.stream().map(FoundWord::getWord).collect(Collectors.joining(","))); + Assert.assertEquals(Integer.valueOf(0), result.get(0).getBeginIndex()); + Assert.assertEquals(Integer.valueOf(0), result.get(0).getEndIndex()); + Assert.assertEquals(Integer.valueOf(0), result.get(1).getBeginIndex()); + Assert.assertEquals(Integer.valueOf(1), result.get(1).getEndIndex()); + Assert.assertEquals(Integer.valueOf(0), result.get(2).getBeginIndex()); + Assert.assertEquals(Integer.valueOf(2), result.get(2).getEndIndex()); + + final WordTree wordTreeLocal = new WordTree(); + wordTreeLocal.addWords("赵", "赵啊", "赵啊三"); + + stopWatch.start("wordtree_cn_find"); + final List result1 = wordTreeLocal.matchAllWords(input, -1, true, true).stream().map(FoundWord::getWord) + .collect(Collectors.toList()); + stopWatch.stop(); + + Assert.assertEquals(3, result1.size()); + Assert.assertEquals("赵,赵啊,赵啊三", String.join(",", result1)); + + //Console.log(stopWatch.prettyPrint()); + } + + /** + * 非密集匹配 测试构建树和查找 中文字符,并与wordTree对比效率, + */ + @Test + public void testFindCNCharNotDensity() { + final StopWatch stopWatch = new StopWatch(); + final String input = "赵啊三在做什么"; + + final NFA NFALocal = new NFA(); + NFALocal.insert("赵", "赵啊", "赵啊三"); + NFALocal.buildAc(); + + stopWatch.start("automaton_cn_find_not_density"); + final List result = NFALocal.find(input, false); + stopWatch.stop(); + + Assert.assertEquals(1, result.size()); + Assert.assertEquals("赵", result.stream().map(FoundWord::getWord).collect(Collectors.joining(","))); + Assert.assertEquals(Integer.valueOf(0), result.get(0).getBeginIndex()); + Assert.assertEquals(Integer.valueOf(0), result.get(0).getEndIndex()); + + final WordTree wordTreeLocal = new WordTree(); + wordTreeLocal.addWords("赵", "赵啊", "赵啊三"); + + stopWatch.start("wordtree_cn_find_not_density"); + final List result1 = + wordTreeLocal.matchAllWords(input, -1, false, true).stream().map(FoundWord::getWord) + .collect(Collectors.toList()); + stopWatch.stop(); + + Assert.assertEquals(1, result1.size()); + Assert.assertEquals("赵", String.join(",", result1)); + + //Console.log(stopWatch.prettyPrint()); + } +}