从语料中自动挖掘短语
https://github.com/shangjingbo1226/AutoPhrase
预测搜索短语可采用 FST(Finite State Transducer,有限状态转换器)结构,参考资料:
https://blog.csdn.net/vivian_ll/article/details/95049652
https://www.youtube.com/watch?v=3kQyYbTyXfc
https://www.youtube.com/watch?v=k97WC5ijB7U
https://speakerdeck.com/mschoch/finite-state-transducers-in-go
package core.index;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.*;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Small demonstration of prefix matching against a Lucene FST: builds a tiny
 * FST of phrases and reports, for a query string, every position at which a
 * prefix of the query is a complete key of the FST.
 */
public class FstSearch {

    /**
     * Builds a byte-labelled FST that maps five sample phrases to sample long outputs.
     *
     * <p>The keys are listed in lexicographic order (original author's note:
     * "the input is guaranteed to be in dictionary order"); they are added to
     * the builder in exactly that order.
     *
     * @return the finished FST
     * @throws IOException if the underlying builder fails
     */
    public static FST<Long> buildFst() throws IOException {
        String[] inputValues = {"cat house", "dog", "dog house", "dogs house", "dogs houses"};
        long[] outputValues = {5, 7, 8, 12, 16};

        PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
        Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        // Reused across iterations; Util.toIntsRef fills it from the key's bytes.
        IntsRefBuilder scratchInts = new IntsRefBuilder();

        for (int i = 0; i < inputValues.length; i++) {
            BytesRef keyBytes = new BytesRef(inputValues[i]);
            builder.add(Util.toIntsRef(keyBytes, scratchInts), outputValues[i]);
        }
        return builder.finish();
    }

    /**
     * Builds the sample FST and prints, for a fixed list of queries, the query
     * followed by the offsets returned by {@link #search(FST, String)}.
     *
     * @param args unused
     * @throws IOException if building or reading the FST fails
     */
    public static void main(String[] args) throws IOException {
        FST<Long> fst = buildFst();

        String[] queries = {
            "do", "dog", "dog house", "dogs house", "dogs houses",
            "c", "ca", "cat", "cat houses"
        };
        for (String query : queries) {
            System.out.println(query + " " + search(fst, query));
        }
    }

    /**
     * Walks {@code input} through {@code fst} one byte at a time and collects
     * every offset at which the bytes consumed so far form a complete key.
     *
     * <p>For example, with the keys {@code "dog"} and {@code "dog house"} in the
     * FST, searching {@code "dog house"} yields {@code [3, 9]}.
     *
     * <p>Offsets are positions in the byte encoding produced by
     * {@link BytesRef#BytesRef(CharSequence)}. Original author's note: the data is
     * currently English, so one byte equals one character and byte offsets
     * coincide with character offsets; other languages need this to be adapted.
     *
     * @param fst   the FST to match against; the method asserts it is byte-labelled
     *              ({@code FST.INPUT_TYPE.BYTE1})
     * @param input the text whose prefixes are looked up
     * @param <T>   the FST output type (outputs are not read here)
     * @return the end offsets (exclusive) of all prefixes of {@code input} that are
     *         keys of {@code fst}, in increasing order; empty if there are none
     * @throws IOException if reading the FST fails
     */
    public static <T> List<Integer> search(FST<T> fst, String input) throws IOException {
        List<Integer> offsets = new ArrayList<>();
        BytesRef bytesRef = new BytesRef(input);
        assert fst.inputType == FST.INPUT_TYPE.BYTE1;

        FST.BytesReader fstReader = fst.getBytesReader();
        // 'arc' is the arc reached after consuming the bytes seen so far;
        // 'holder' is scratch space that findTargetArc fills with the next arc.
        FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<>());
        FST.Arc<T> holder = new FST.Arc<>();

        for (int i = 0; i < bytesRef.length; ++i) {
            // Mask to 0..255 so the signed byte becomes an unsigned label.
            int label = bytesRef.bytes[i + bytesRef.offset] & 0xFF;
            FST.Arc<T> targetArc = fst.findTargetArc(label, arc, holder, fstReader);

            // The first i bytes form a complete key. Original author's note: an
            // earlier variant additionally required arc.target == -1; that extra
            // check was dropped so that all matches are reported.
            if (arc.isFinal()) {
                offsets.add(i);
            }
            if (targetArc == null) {
                // No transition for this byte: nothing longer can match.
                return offsets;
            }
            arc.copyFrom(holder);
        }

        // The entire input was consumed; record it if it is itself a key.
        if (arc.isFinal()) {
            offsets.add(bytesRef.length);
        }
        return offsets;
    }
}