Lucene 自定义分词器

自定义分词器其实就是重写 Tokenizer 里面的 incrementToken 方法和 Analyzer 里面的 createComponents 方法。也可以自定义一个 Attribute：在 Tokenizer 中通过 addAttribute 方法添加，然后用 getAttribute 获取，借此可以封装自己想要的属性。

package com.tianditu.analyzer;
import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token.TokenAttributeFactory;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource.AttributeFactory;

import com.tianditu.attribute.impl.CityCodeAttribute;
import com.tianditu.attribute.impl.TokenTypeAttribute;
import com.tianditu.domain.config.AdminData;
import com.tianditu.domain.config.AdminWord;
import com.tianditu.domain.road.Road;
import com.tianditu.domain.road.RoadTree;
import com.tianditu.util.StringTool;
import com.tianditu.util.Util;

/**
 * Analyzer whose token stream consists solely of a {@link SearchWordTokenizer}
 * configured with four dictionaries (administrative divisions, names,
 * addresses and roads).
 */
public class SearchWordAnalyzer extends Analyzer {
    /** Administrative-division dictionary. */
    private final Road adminForStore;
    /** Name dictionary. */
    private final Road nameForStore;
    /** Address dictionary. */
    private final Road addressForStore;
    /** Road dictionary. */
    private final Road roadForStore;

    /**
     * Creates an analyzer backed by the given dictionaries.
     *
     * @param admin   administrative-division dictionary
     * @param name    name dictionary
     * @param address address dictionary
     * @param road    road dictionary
     */
    public SearchWordAnalyzer(Road admin, Road name, Road address, Road road) {
        this.adminForStore = admin;
        this.nameForStore = name;
        this.addressForStore = address;
        this.roadForStore = road;
    }

    /**
     * Builds the analysis chain for a field: a single
     * {@link SearchWordTokenizer} reading from {@code reader}, with no
     * token filters applied afterwards.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader    character source handed to the tokenizer
     * @return components wrapping the newly created tokenizer
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
            Reader reader) {
        final SearchWordTokenizer source = new SearchWordTokenizer(
                TokenAttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                reader,
                adminForStore,
                nameForStore,
                addressForStore,
                roadForStore);
        return new TokenStreamComponents(source);
    }

}

/**
 * Dictionary-driven tokenizer.
 *
 * <p>On the first {@link #incrementToken()} after a {@link #reset()} the whole
 * input is read into an in-memory buffer. Each call then emits one token
 * starting at the current cursor: a dictionary match when
 * {@link #getMathchWords(String)} finds one, otherwise a run of characters
 * accepted by {@code Util.isNumber}, otherwise a single character.
 */
class SearchWordTokenizer extends Tokenizer {
    // Token type labels written to the TypeAttribute.
    static final String ADMIN_TYPE = "0";  // administrative-division dictionary match
    static final String ROAD_TYPE = "1";   // road dictionary match
    static final String ADRES_TYPE = "2";  // address dictionary match
    static final String CHAR_TYPE = "3";   // single-character fallback
    static final String NUMBER_TYPE = "4"; // numeric run fallback
    static final String NAME_TYPE = "5";   // name dictionary match
    static final String NULL_TYPE = "6";   // not assigned anywhere in this class

    /** Initial (and post-reset maximum) capacity of the input buffer. */
    private static final int IO_BUFFER = 4096;

    private final CharTermAttribute attr;       // term text
    private final OffsetAttribute offset;       // start/end offsets of the term
    private final TypeAttribute type;           // one of the *_TYPE labels above
    private final TokenTypeAttribute tokenType; // custom attribute; registered but never written here
    private final CityCodeAttribute cityCode;   // custom attribute; set from administrative matches

    private Road nameForStore;   // name dictionary
    private Road addressForStor; // address dictionary
    private Road roadForStore;   // road dictionary
    private Road adminForStore;  // administrative-division dictionary

    private char[] ioBuffer = new char[IO_BUFFER];
    private boolean loaded;   // true once the reader has been drained into ioBuffer
    private int bufferLength; // number of valid characters in ioBuffer
    private int cursor;       // index in ioBuffer where the next token starts

    /**
     * Creates a tokenizer without dictionaries and registers all attributes.
     * NOTE(review): with this constructor the dictionary fields stay
     * {@code null}, so {@link #getMathchWords(String)} will fail for inputs
     * longer than two characters; it is meant to be reached through the
     * six-argument constructor.
     *
     * @param factory attribute factory passed to the superclass
     * @param input   character source
     */
    protected SearchWordTokenizer(AttributeFactory factory, Reader input) {
        super(factory, input);
        this.attr = addAttribute(CharTermAttribute.class);
        this.offset = addAttribute(OffsetAttribute.class);
        this.type = addAttribute(TypeAttribute.class);
        this.cityCode = addAttribute(CityCodeAttribute.class);
        this.tokenType = addAttribute(TokenTypeAttribute.class);
    }

    /**
     * Creates a tokenizer backed by the given dictionaries.
     *
     * @param factory attribute factory passed to the superclass
     * @param input   character source
     * @param admin   administrative-division dictionary
     * @param name    name dictionary
     * @param address address dictionary
     * @param road    road dictionary
     */
    public SearchWordTokenizer(AttributeFactory factory, Reader input,
            Road admin, Road name, Road address, Road road) {
        this(factory, input);
        this.nameForStore = name;
        this.roadForStore = road;
        this.addressForStor = address;
        this.adminForStore = admin;
    }

    /**
     * Clears all per-stream state so the instance can tokenize a new reader.
     * The buffered length is cleared as well; otherwise a shorter input
     * following a longer one would be tokenized past its real end.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        loaded = false;
        cursor = 0;
        bufferLength = 0;
        if (ioBuffer.length > IO_BUFFER) {
            // Do not keep an oversized buffer alive after one large input.
            ioBuffer = new char[IO_BUFFER];
        }
    }

    /**
     * Sets the final offset to the end of the consumed input, as consumers of
     * a token stream expect after the last token.
     */
    @Override
    public void end() throws IOException {
        super.end();
        final int finalOffset = correctOffset(bufferLength);
        offset.setOffset(finalOffset, finalOffset);
    }

    /**
     * Grows the input buffer when its capacity is less than or equal to
     * {@code newSize}. The new capacity is the smallest power of two strictly
     * greater than {@code newSize} (for example 17 -> 32, 4096 -> 8192);
     * existing contents are preserved. Does nothing when the buffer is
     * already larger than {@code newSize}.
     *
     * @param newSize number of characters the buffer must be able to exceed
     */
    public void resizeBufferIo(int newSize) {
        if (ioBuffer.length > newSize) {
            return;
        }
        // Smear the highest set bit into every lower bit, then add one.
        int capacity = newSize;
        capacity |= (capacity >>> 1);
        capacity |= (capacity >>> 2);
        capacity |= (capacity >>> 4);
        capacity |= (capacity >>> 8);
        capacity |= (capacity >>> 16);
        capacity++;

        if (capacity < 0) {
            // The increment overflowed; fall back to 2^30.
            capacity >>>= 1;
        }

        final char[] grown = new char[capacity];
        System.arraycopy(ioBuffer, 0, grown, 0, ioBuffer.length);
        ioBuffer = grown;
    }

    /**
     * Emits the next token.
     *
     * @return {@code true} if a token was produced, {@code false} once the
     *         buffered input is exhausted
     * @throws IOException if reading the input fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        if (!loaded) {
            fillBuffer();
            loaded = true;
        }
        if (cursor >= bufferLength) {
            return false;
        }

        int tokenLength = matchDictionaryToken();
        if (tokenLength == 0) {
            tokenLength = matchFallbackToken();
        }

        attr.copyBuffer(ioBuffer, cursor, tokenLength);
        offset.setOffset(correctOffset(cursor), correctOffset(cursor + tokenLength));
        cursor += tokenLength;
        return true;
    }

    /**
     * Reads the entire input into {@code ioBuffer}, growing it as needed, and
     * records the total number of characters read in {@code bufferLength}.
     */
    private void fillBuffer() throws IOException {
        bufferLength = 0;
        while (true) {
            if (bufferLength == ioBuffer.length) {
                resizeBufferIo(bufferLength);
            }
            final int read = input.read(ioBuffer, bufferLength,
                    ioBuffer.length - bufferLength);
            if (read == -1) {
                break; // end of input
            }
            bufferLength += read;
        }
    }

    /**
     * Looks up the text from the cursor to the end of the buffer in the
     * dictionaries.
     *
     * @return the length of the matched token, or 0 when there is no match
     */
    private int matchDictionaryToken() {
        final int remaining = bufferLength - cursor;
        final String matched = getMathchWords(new String(ioBuffer, cursor, remaining));
        if (matched == null || matched.isEmpty()) {
            return 0;
        }
        // Never let a dictionary entry run past the buffered input.
        return Math.min(matched.length(), remaining);
    }

    /**
     * Fallback when no dictionary matches: takes the longest prefix at the
     * cursor that {@code Util.isNumber} accepts and labels it
     * {@link #NUMBER_TYPE}; if not even the first character is accepted, takes
     * that single character and labels it {@link #CHAR_TYPE}.
     *
     * @return the length of the fallback token (always at least 1)
     */
    private int matchFallbackToken() {
        int digits = 0;
        // Extend the run one character at a time; the bound check comes first
        // so a number at the very end of the input keeps its last character.
        while (cursor + digits < bufferLength
                && Util.isNumber(new String(Util.copyChar(ioBuffer, cursor, digits + 1)))) {
            digits++;
        }
        if (digits > 0) {
            type.setType(NUMBER_TYPE);
            return digits;
        }
        type.setType(CHAR_TYPE);
        return 1;
    }

    /**
     * Finds a dictionary entry for {@code words}, consulting the dictionaries
     * in the order administrative division, road, address, name.
     *
     * <p>An administrative match longer than two characters that
     * {@code AdminWord.getAdminWord} recognises is returned immediately.
     * Otherwise a later dictionary replaces the current match only when its
     * {@code Util.getMaxLengthWithoutNumber} value exceeds the current match's
     * length. The chosen match is rejected when its own
     * {@code getMaxLengthWithoutNumber} value is 2 or less.
     *
     * <p>Side effects: sets the type attribute to the label of the chosen
     * dictionary (or to the empty string on rejection) and sets the city code
     * whenever the administrative dictionary hits.
     *
     * @param words text starting at the current token position
     * @return the matched entry, or {@code null} when {@code words} has at
     *         most two characters or nothing acceptable matched
     */
    String getMathchWords(String words) {
        if (words.length() <= 2) {
            return null;
        }
        words = StringTool.CharStandardization(words);
        final RoadTree admin = adminForStore.getTree(words, true);
        final RoadTree name = nameForStore.getTree(words, true);
        final RoadTree address = addressForStor.getTree(words, true);
        final RoadTree road = roadForStore.getTree(words, true);

        String match = null;
        int matchLength = 0; // getMaxLengthWithoutNumber of the chosen match

        if (admin != null && admin.next != null && admin.next.cityCode != null) {
            // Administrative divisions take priority over the other dictionaries.
            match = adminForStore.getRoadName(admin);
            matchLength = Util.getMaxLengthWithoutNumber(match);
            type.setType(ADMIN_TYPE);
            // NOTE(review): the city code stays set even if a later dictionary
            // wins or the match is rejected below - confirm this is intended.
            cityCode.setCode(admin.next.cityCode);
            if (match.length() > 2 && AdminWord.getAdminWord(match) != null) {
                return match;
            }
        }

        final Road[] dictionaries = {roadForStore, addressForStor, nameForStore};
        final RoadTree[] hits = {road, address, name};
        final String[] labels = {ROAD_TYPE, ADRES_TYPE, NAME_TYPE};
        for (int k = 0; k < hits.length; k++) {
            final RoadTree hit = hits[k];
            if (hit == null || hit.next == null) {
                continue;
            }
            final String candidate = dictionaries[k].getRoadName(hit);
            final int candidateLength = Util.getMaxLengthWithoutNumber(candidate);
            if (match == null || candidateLength > match.length()) {
                match = candidate;
                matchLength = candidateLength;
                type.setType(labels[k]);
            }
        }

        if (match == null) {
            return null;
        }
        // Judge the chosen match by its own length, not by the length of
        // whichever dictionary happened to be examined last.
        if (matchLength <= 2) {
            type.setType("");
            return null;
        }
        return match;
    }

}
里面的字典是我自己写的一个 Trie（字典树），用来存储词条，使用最长匹配规则
输入:西城区天桥街道西城阡儿路71号西城永安路社区 ,字典不全，所以只识别到这些东西
posted @ 2017-12-02 19:57 王南辉阅读(324) 评论(0) 编辑收藏举报
会员力量，点亮园子希望
刷新页面返回顶部
王南辉

Lucene 自定义分词器

公告