Lucene的DocFieldProcessor类
DocFieldProcessor类的任务
1 按顺序存储所有的field和对应的fieldinfo
2 为当前这篇doc的field按照fieldname来建立hash索引
3 调用DocFieldConsumer类(抽象),对field的内容分词和建立内存索引
DocFieldProcessor类主要用到的其他类
DocFieldProcessorPerField类的对象负责存放一个fieldinfo和其对应的field(可能是多个,它们的fieldinfo相同),next成员可以指向下一个DocFieldProcessorPerField类对象,构成链表(用于解决fieldhash冲突)
DocInverterPerField是DocFieldProcessorPerField内用到的类,负责解析同一个fieldinfo里的field,建立索引
// Per-document field processor: groups a document's fields by FieldInfo,
// forwards them to the inverting consumer and the stored-fields consumer.
final class DocFieldProcessor extends DocConsumer {

  final DocFieldConsumer consumer;
  final StoredFieldsConsumer storedConsumer;
  final Codec codec;

  // All DocFieldProcessorPerField of the current doc (each entry groups the
  // fields that share one FieldInfo), stored in encounter order, one per slot.
  DocFieldProcessorPerField[] fields = new DocFieldProcessorPerField[1];
  int fieldCount;

  // Same DocFieldProcessorPerField entries, addressed by hash of the field
  // name; one slot may hold several entries, chained via their next pointers.
  DocFieldProcessorPerField[] fieldHash = new DocFieldProcessorPerField[2];
  int hashMask = 1;
  int totalFieldCount;

  // Bumped once per processed document; used to detect the first occurrence
  // of a field within the current document (see processDocument).
  int fieldGen;

  // Shallow reference to the document currently being analyzed.
  final DocumentsWriterPerThread.DocState docState;
  final Counter bytesUsed;
DocFieldProcessor.processDocument(FieldInfos.Builder fieldInfos)是DocFieldProcessor类主要功能的实现函数
  /**
   * Processes the current document ({@code docState.doc}): groups its fields
   * by field name into DocFieldProcessorPerField entries, feeds each field to
   * the stored-fields consumer, then hands every group to its inverting
   * consumer for analysis/indexing.
   *
   * @param fieldInfos builder used to register/update each field's FieldInfo
   * @throws IOException if a downstream consumer fails
   */
  public void processDocument(FieldInfos.Builder fieldInfos) throws IOException {

    consumer.startDocument();
    storedConsumer.startDocument();

    fieldCount = 0;

    // Generation stamp for this document; lets us tell whether a hashed
    // entry was already seen earlier in this same document.
    final int thisFieldGen = fieldGen++;

    // Loop over the doc's fields, merging fields with the same FieldInfo into
    // one DocFieldProcessorPerField; newly created entries are added to both
    // the sequential array and the hash table.
    for(IndexableField field : docState.doc) {
      final String fieldName = field.name();

      // Slot of this field name in the hash table.
      final int hashPos = fieldName.hashCode() & hashMask;

      // Walk the collision chain looking for an entry with this field name.
      DocFieldProcessorPerField fp = fieldHash[hashPos];
      while(fp != null && !fp.fieldInfo.name.equals(fieldName)) {
        fp = fp.next;
      }

      if (fp == null) {
        // First time this field name is seen: register its FieldInfo and
        // create the corresponding DocFieldProcessorPerField.
        FieldInfo fi = fieldInfos.addOrUpdate(fieldName, field.fieldType());

        fp = new DocFieldProcessorPerField(this, fi);

        // Prepend to the collision chain of its hash slot.
        fp.next = fieldHash[hashPos];
        fieldHash[hashPos] = fp;
        totalFieldCount++;

        // Hash table is half full: grow and rehash.
        if (totalFieldCount >= fieldHash.length/2) {
          rehash();
        }
      } else {
        // Field name already known: update its FieldInfo in place.
        FieldInfo fi = fieldInfos.addOrUpdate(fieldName, field.fieldType());
        assert fi == fp.fieldInfo : "should only have updated an existing FieldInfo instance";
      }

      if (thisFieldGen != fp.lastGen) {
        // First occurrence of this field within the CURRENT document:
        // reset its per-doc field count and append it to the sequential array.
        fp.fieldCount = 0;

        // Grow the sequential array if it is full.
        if (fieldCount == fields.length) {
          final int newSize = fields.length*2;
          DocFieldProcessorPerField newArray[] = new DocFieldProcessorPerField[newSize];
          System.arraycopy(fields, 0, newArray, 0, fieldCount);
          fields = newArray;
        }

        fields[fieldCount++] = fp;
        fp.lastGen = thisFieldGen;
      }

      // Add the field to fp; fields sharing a FieldInfo accumulate in one fp.
      fp.addField(field);

      storedConsumer.addField(docState.docID, field, fp.fieldInfo);
    }

    // Sort the per-field entries, then let each one's inverting consumer
    // (DocInverterPerField) analyze/index its accumulated fields.
    ArrayUtil.introSort(fields, 0, fieldCount, fieldsComp);
    for(int i=0;i<fieldCount;i++) {
      final DocFieldProcessorPerField perField = fields[i];
      perField.consumer.processFields(perField.fields, perField.fieldCount);
    }
  }