Lucene 自定义分词器

自定义分词器其实就是重写 Tokenizer 里面的 incrementToken 方法和 Analyzer 里面的 createComponents 方法。也可以自定义一个 Attribute:用 addAttribute 方法添加 Attribute,然后用 getAttribute 获取,借此可以封装自己想要的属性。

package com.tianditu.analyzer;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token.TokenAttributeFactory;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource.AttributeFactory;

import com.tianditu.attribute.impl.CityCodeAttribute;
import com.tianditu.attribute.impl.TokenTypeAttribute;
import com.tianditu.domain.config.AdminData;
import com.tianditu.domain.config.AdminWord;
import com.tianditu.domain.road.Road;
import com.tianditu.domain.road.RoadTree;
import com.tianditu.util.StringTool;
import com.tianditu.util.Util;

/**
 * Analyzer whose token stream is produced by a single {@link SearchWordTokenizer}
 * backed by four dictionaries (administrative divisions, names, addresses, roads).
 * No token filters are chained after the tokenizer.
 */
public class SearchWordAnalyzer extends Analyzer {
    /** Administrative-division dictionary. */
    private final Road adminDictionary;
    /** Name dictionary. */
    private final Road nameDictionary;
    /** Address dictionary. */
    private final Road addressDictionary;
    /** Road dictionary. */
    private final Road roadDictionary;

    /**
     * @param admin   administrative-division dictionary
     * @param name    name dictionary
     * @param address address dictionary
     * @param road    road dictionary
     */
    public SearchWordAnalyzer(Road admin, Road name, Road address, Road road) {
        this.adminDictionary = admin;
        this.nameDictionary = name;
        this.addressDictionary = address;
        this.roadDictionary = road;
    }

    /**
     * Builds the analysis chain for a field: one {@link SearchWordTokenizer}
     * reading from {@code reader}, created with the default attribute factory.
     * The field name is not used.
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
            Reader reader) {
        return new TokenStreamComponents(new SearchWordTokenizer(
                TokenAttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
                reader, adminDictionary, nameDictionary, addressDictionary,
                roadDictionary));
    }

}

class SearchWordTokenizer extends Tokenizer {
    @Override
    public void reset() throws IOException {//需要重写reset
        // TODO Auto-generated method stub
        super.reset();
        this.upto = 0;
        this.i = 0;
        this.done = false;
        this.attr = addAttribute(CharTermAttribute.class);
        this.offset = addAttribute(OffsetAttribute.class);
        this.type = addAttribute(TypeAttribute.class);
        ioBuffer = new char[IO_BUFFER];
    }

    final String ADMIN_TYPE = "0";// 分词的属性
    final String ROAD_TYPE = "1";
    final String ADRES_TYPE = "2";
    final String CHAR_TYPE = "3";
    final String NUMBER_TYPE = "4";
    final String NAME_TYPE = "5";
    final String NULL_TYPE = "6";
private CharTermAttribute attr;// 存储词条 private OffsetAttribute offset; private TypeAttribute type; // 分词类型 private TokenTypeAttribute tokenType; // 分词此条属性 private CityCodeAttribute cityCode;// 行政区划 private Road nameForStore;// 名称字典 private Road addressForStor;// 地址字典 private Road roadForStore;// 道路字典 private Road adminForStore; // private static final int IO_BUFFER = 4096; // private static final int MIN_INITIAL_CAPACITY = 1 << 4; private char[] ioBuffer = new char[IO_BUFFER]; private boolean done;// private int upto = 0; private int i = 0; private int max_int; protected SearchWordTokenizer(AttributeFactory factory, Reader input) { super(factory, input); this.attr = addAttribute(CharTermAttribute.class); this.offset = addAttribute(OffsetAttribute.class); this.type = addAttribute(TypeAttribute.class); this.cityCode = addAttribute(CityCodeAttribute.class);//自定义的两个attrbute 很简单 this.tokenType = addAttribute(TokenTypeAttribute.class); this.done = false; } public SearchWordTokenizer(AttributeFactory factory, Reader input, Road admin, Road name, Road address, Road road) { this(factory, input); this.nameForStore = name; this.roadForStore = road; this.addressForStor = address; this.adminForStore = admin; } public void resizeBufferIo(int newSize) { //扩容 相当于是 把numElements 扩容到所占的二进制位数的最大值 如 4-> 0100扩容到16 17 -> 32
int initialCapacity = ioBuffer.length; if (initialCapacity <= newSize) { initialCapacity = newSize; initialCapacity |= (initialCapacity >>> 1); initialCapacity |= (initialCapacity >>> 2); initialCapacity |= (initialCapacity >>> 4); initialCapacity |= (initialCapacity >>> 8); initialCapacity |= (initialCapacity >>> 16); initialCapacity++; if (initialCapacity < 0) initialCapacity >>>= 1; final char[] temp = new char[initialCapacity]; System.arraycopy(ioBuffer, 0, temp, 0, ioBuffer.length); ioBuffer = temp; } } @Override public boolean incrementToken() throws IOException { if (!done) { clearAttributes(); done = true; upto = 0; // i = 0; int length = 0; while (true) { length = input.read(ioBuffer, upto, ioBuffer.length - upto);// 读取upto // 到最后一个位置的字符串 if (length == -1) { break; // 读完了 } upto += length; if (upto == ioBuffer.length) { resizeBufferIo(upto); } if (length > max_int) max_int = length; } if (i < max_int) { char[] matchChar = new char[max_int - i]; System.arraycopy(ioBuffer, i, matchChar, 0, max_int - i); String mathch = new String(matchChar); String matchString = getMathchWords(mathch); // System.out.println(type.type()); if (matchString != null) { System.out.println(matchString); attr.copyBuffer(ioBuffer, i, matchString.length()); offset.setOffset(i, i + matchString.length()); i += matchString.length(); } else { int leng = 1; char[] number = Util.copyChar(ioBuffer, i, leng); boolean isNumber = false; while (Util.isNumber(new String(number)) && (leng + i) < max_int) { isNumber = true; leng++; number = Util.copyChar(ioBuffer, i, leng); } if (isNumber) { leng--; type.setType(NUMBER_TYPE); } else { type.setType(CHAR_TYPE); } attr.copyBuffer(ioBuffer, i, leng); offset.setOffset(i, i + leng); i += leng; } done = (i == length); return true; } } return false; } /** * 识别规则 * 行政区划 道路 地址 名称 最长匹配 * @param words * @return */ String getMathchWords(String words) { String match = null; if (words.length() > 2) { words = StringTool.CharStandardization(words); RoadTree admin = 
adminForStore.getTree(words, true); RoadTree name = nameForStore.getTree(words, true); RoadTree address = addressForStor.getTree(words, true); RoadTree road = roadForStore.getTree(words, true); int length = 0 ; if (admin != null && admin.next != null && admin.next.cityCode != null) { // 优先识别行政区划 match = adminForStore.getRoadName(admin); length = Util.getMaxLengthWithoutNumber(match); type.setType(ADMIN_TYPE); cityCode.setCode(admin.next.cityCode); if(admin.next.cityCode != null && match.length() > 2 && AdminWord.getAdminWord(match) != null) return match ; } if (road != null && road.next != null) { String match3 = roadForStore.getRoadName(road); length = Util.getMaxLengthWithoutNumber(match3); if (match != null) { if (length > match.length()) { match = match3; type.setType(ROAD_TYPE); } } else { match = match3; type.setType(ROAD_TYPE); } } if (address != null && address.next != null) { String match2 = addressForStor.getRoadName(address); length = Util.getMaxLengthWithoutNumber(match2); if (match != null) { if (length > match.length()) { match = match2; type.setType(ADRES_TYPE); } } else { match = match2; type.setType(ADRES_TYPE); } } if (name != null && name.next != null) { String match0 = nameForStore.getRoadName(name); length = Util.getMaxLengthWithoutNumber(match0); if (match != null) { if (length > match.length()) { match = match0; type.setType(NAME_TYPE); } } else { match = match0; type.setType(NAME_TYPE); } } if (match == null) return null; if (length <= 2) { type.setType(""); return null; } } return match; } }
里面的字典是我自己写的一个 Trie(字典树),用来存储词典,匹配时使用最长匹配规则。
输入:西城区天桥街道西城阡儿路71号西城永安路社区。由于字典不全,所以只识别到下面这些内容。





posted @ 2017-12-02 19:57  王南辉  阅读(324)  评论(0编辑  收藏  举报