从语料中自动挖掘短语

从语料中自动挖掘短语

https://github.com/shangjingbo1226/AutoPhrase

预测搜索短语可采用 FST(有限状态转换器)结构,参考资料:

https://blog.csdn.net/vivian_ll/article/details/95049652

https://www.youtube.com/watch?v=3kQyYbTyXfc

https://www.youtube.com/watch?v=k97WC5ijB7U

 https://speakerdeck.com/mschoch/finite-state-transducers-in-go

package core.index;


import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.*;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;


/**
 * Demonstrates prefix matching against a Lucene finite state transducer (FST).
 *
 * <p>{@link #buildFst()} builds a small FST from a fixed list of English phrases, and
 * {@link #search(FST, String)} reports every offset in a query at which the query's prefix
 * is itself one of the indexed phrases.
 */
public class FstSearch {

    /**
     * Builds the demo FST that maps five phrases to positive long outputs.
     *
     * <p>The phrases are added to the builder in the order listed; the input must be in
     * lexicographic order.
     *
     * @return the finished FST over the demo phrases
     * @throws IOException if the underlying builder fails
     */
    public static FST<Long> buildFst() throws IOException {
        String[] inputValues = {"cat house", "dog", "dog house", "dogs house", "dogs houses"};
        long[] outputValues = {5, 7, 8, 12, 16};
        PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
        Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);

        // One scratch buffer is reused for the int-encoded form of every phrase.
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        for (int i = 0; i < inputValues.length; i++) {
            BytesRef scratchBytes = new BytesRef(inputValues[i]);
            builder.add(Util.toIntsRef(scratchBytes, scratchInts), outputValues[i]);
        }
        return builder.finish();
    }

    /**
     * Runs {@link #search(FST, String)} over a handful of sample queries and prints each query
     * followed by the offsets found for it.
     */
    public static void main(String[] args) throws IOException {
        FST<Long> fst = FstSearch.buildFst();

        // Short queries get three tabs and long ones a single tab so the result column lines up.
        printMatches(fst, "do", "\t\t\t");
        printMatches(fst, "dog", "\t\t\t");
        printMatches(fst, "dog house", "\t");
        printMatches(fst, "dogs house", "\t");
        printMatches(fst, "dogs houses", "\t");
        printMatches(fst, "c", "\t\t\t");
        printMatches(fst, "ca", "\t\t\t");
        printMatches(fst, "cat", "\t\t\t");
        printMatches(fst, "cat houses", "\t");
    }

    /** Prints {@code query}, then {@code separator}, then the offsets returned by {@link #search}. */
    private static void printMatches(FST<Long> fst, String query, String separator) throws IOException {
        System.out.println(query + separator + search(fst, query));
    }

    /**
     * Finds every prefix of {@code input} that is a complete entry of {@code fst}.
     *
     * <p>The input is converted to bytes via {@link BytesRef} and the FST is walked one byte at
     * a time. Each returned value is the length, in bytes, of a matching prefix. Because the
     * demo data is English, byte offsets and character offsets coincide; for other languages
     * the offsets would need to be mapped back to character positions.
     *
     * <p>With the demo FST, {@code "dogs houses"} is expected to yield {@code [3, 10, 11]}
     * (for "dog", "dogs house" and "dogs houses").
     *
     * @param fst   the transducer to match against; must use {@code BYTE1} input labels
     * @param input the text whose prefixes are tested
     * @param <T>   the FST output type (outputs are not read by this method)
     * @return byte lengths of all matching prefixes, in increasing order; empty if none match
     * @throws IllegalArgumentException if {@code fst} does not use {@code BYTE1} input labels
     * @throws IOException              if reading the FST fails
     */
    public static <T> List<Integer> search(FST<T> fst, String input) throws IOException {
        // An explicit check (rather than an assert) so a non-byte FST is rejected even when
        // assertions are disabled, instead of being walked with the wrong label width.
        if (fst.inputType != FST.INPUT_TYPE.BYTE1) {
            throw new IllegalArgumentException("search requires a BYTE1 FST but got " + fst.inputType);
        }

        List<Integer> offsets = new ArrayList<>();
        BytesRef bytesRef = new BytesRef(input);

        FST.BytesReader fstReader = fst.getBytesReader();
        // 'arc' is the arc reached after the bytes consumed so far; 'next' receives each lookup.
        FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
        FST.Arc<T> next = new FST.Arc<T>();
        for (int i = 0; i < bytesRef.length; ++i) {
            // Mask to an unsigned value: Java bytes are signed but the label is passed as an int.
            int label = bytesRef.bytes[bytesRef.offset + i] & 0xFF;
            FST.Arc<T> targetArc = fst.findTargetArc(label, arc, next, fstReader);

            // The first i bytes form a complete entry.
            // NOTE(review): the original author's note suggests also requiring arc.target == -1
            // to keep only matches that cannot be extended further — unverified.
            if (arc.isFinal()) {
                offsets.add(i);
            }
            if (targetArc == null) {
                // No transition for this byte: no longer prefix can match.
                return offsets;
            }
            arc.copyFrom(next);
        }
        // The whole input was consumed; record it if it is itself a complete entry.
        if (arc.isFinal()) {
            offsets.add(bytesRef.length);
        }
        return offsets;
    }
}

  

原文地址:https://www.cnblogs.com/startnow/p/14092245.html