由于不知道最终会分成几个词,所以定义数组时只能用最大的容量 MAX_SEGMENT_NUM。
而 SharpICTCLAS 中大量使用了 List<int[]> 的方式记录结果,泛型的 List 首先可以确保结果集的数量可以动态调整而不用事先定义,另外每个结果的数组长度也可各不相同。
WordResult[] tmpResult;
WordLinkedArray linkedArray;
m_pWordSeg = new List<WordResult[]>();
m_graphOptimum = new RowFirstDynamicArray<ChainContent>();
atomSegment = AtomSegment(sSentence);
segGraph = GenerateWordNet(atomSegment, coreDict);
biGraphResult = BiGraphGenerate(segGraph, smoothPara, biDict, coreDict);
//---N 最短路径计算出多个分词方案
NShortPath.Calculate(biGraphResult, nKind);
List<int[]> spResult = NShortPath.GetNPaths(Predefine.MAX_SEGMENT_NUM);
for (int i = 0; i < spResult.Count; i++)
linkedArray = BiPath2LinkedArray(spResult[i], segGraph, atomSegment);
tmpResult = GenerateWord(spResult[i], linkedArray, m_graphOptimum);
if (tmpResult != null)
return m_pWordSeg.Count;
//==== 原子切分:
始##始, 他, 在, 1, 月, 份, 大, 会, 上, 说, 的, 确, 实, 在, 理, 末##末,
//==== 生成 segGraph:
row: 0, col: 1, eWeight: 329805.00, nPOS: 1, sWord:始##始
row: 1, col: 2, eWeight: 19823.00, nPOS: 0, sWord:他
row: 2, col: 3, eWeight: 78484.00, nPOS: 0, sWord:在
row: 3, col: 4, eWeight: 0.00, nPOS: -27904, sWord:未##数
row: 4, col: 5, eWeight: 1900.00, nPOS: 0, sWord:月
row: 4, col: 6, eWeight: 11.00, nPOS: 28160, sWord:月份
row: 5, col: 6, eWeight: 1234.00, nPOS: 0, sWord:份
row: 6, col: 7, eWeight: 14536.00, nPOS: 0, sWord:大
row: 6, col: 8, eWeight: 1333.00, nPOS: 28160, sWord:大会
row: 7, col: 8, eWeight: 6136.00, nPOS: 0, sWord:会
row: 7, col: 9, eWeight: 469.00, nPOS: 0, sWord:会上
row: 8, col: 9, eWeight: 23706.00, nPOS: 0, sWord:上
row: 9, col: 10, eWeight: 17649.00, nPOS: 0, sWord:说
row: 10, col: 11, eWeight: 358156.00, nPOS: 0, sWord:的
row: 10, col: 12, eWeight: 210.00, nPOS: 25600, sWord:的确
row: 11, col: 12, eWeight: 181.00, nPOS: 0, sWord:确
row: 11, col: 13, eWeight: 361.00, nPOS: 0, sWord:确实
row: 12, col: 13, eWeight: 357.00, nPOS: 0, sWord:实
row: 12, col: 14, eWeight: 295.00, nPOS: 0, sWord:实在
row: 13, col: 14, eWeight: 78484.00, nPOS: 0, sWord:在
row: 13, col: 15, eWeight: 3.00, nPOS: 24832, sWord:在理
row: 14, col: 15, eWeight: 129.00, nPOS: 0, sWord:理
row: 15, col: 16, eWeight:2079997.00, nPOS: 4, sWord:末##末
//==== 生成 biSegGraph:
row: 0, col: 1, eWeight: 3.37, nPOS: 1, sWord:始##始@他
row: 1, col: 2, eWeight: 3.37, nPOS: 0, sWord:他@在
row: 2, col: 3, eWeight: 3.74, nPOS: 0, sWord:在@未##数
row: 3, col: 4, eWeight: -27898.79, nPOS: -27904, sWord:未##数@月
row: 3, col: 5, eWeight: -27898.75, nPOS: -27904, sWord:未##数@月份
row: 4, col: 6, eWeight: 9.33, nPOS: 0, sWord:月@份
row: 5, col: 7, eWeight: 13.83, nPOS: 28160, sWord:月份@大
row: 6, col: 7, eWeight: 9.76, nPOS: 0, sWord:份@大
row: 5, col: 8, eWeight: 13.83, nPOS: 28160, sWord:月份@大会
row: 6, col: 8, eWeight: 9.76, nPOS: 0, sWord:份@大会
row: 7, col: 9, eWeight: 7.30, nPOS: 0, sWord:大@会
row: 7, col: 10, eWeight: 7.30, nPOS: 0, sWord:大@会上
row: 8, col: 11, eWeight: 2.11, nPOS: 28160, sWord:大会@上
row: 9, col: 11, eWeight: 8.16, nPOS: 0, sWord:会@上
row: 10, col: 12, eWeight: 3.42, nPOS: 0, sWord:会上@说
row: 11, col: 12, eWeight: 4.07, nPOS: 0, sWord:上@说
row: 12, col: 13, eWeight: 4.05, nPOS: 0, sWord:说@的
row: 12, col: 14, eWeight: 7.11, nPOS: 0, sWord:说@的确
row: 13, col: 15, eWeight: 4.10, nPOS: 0, sWord:的@确
row: 13, col: 16, eWeight: 4.10, nPOS: 0, sWord:的@确实
row: 14, col: 17, eWeight: 11.49, nPOS: 25600, sWord:的确@实
row: 15, col: 17, eWeight: 11.63, nPOS: 0, sWord:确@实
row: 14, col: 18, eWeight: 11.49, nPOS: 25600, sWord:的确@实在
row: 15, col: 18, eWeight: 11.63, nPOS: 0, sWord:确@实在
row: 16, col: 19, eWeight: 3.92, nPOS: 0, sWord:确实@在
row: 17, col: 19, eWeight: 10.98, nPOS: 0, sWord:实@在
row: 16, col: 20, eWeight: 10.97, nPOS: 0, sWord:确实@在理
row: 17, col: 20, eWeight: 10.98, nPOS: 0, sWord:实@在理
row: 18, col: 21, eWeight: 11.17, nPOS: 0, sWord:实在@理
row: 19, col: 21, eWeight: 5.62, nPOS: 0, sWord:在@理
row: 20, col: 22, eWeight: 14.30, nPOS: 24832, sWord:在理@末##末
row: 21, col: 22, eWeight: 11.95, nPOS: 0, sWord:理@末##末
//==== NShortPath 初步切分得到的 N 个结果:
始##始, 他, 在, 1, 月份, 大会, 上, 说, 的, 确实, 在, 理, 末##末,
始##始, 他, 在, 1, 月份, 大会, 上, 说, 的, 确实, 在理, 末##末,
始##始, 他, 在, 1, 月份, 大, 会上, 说, 的, 确实, 在, 理, 末##末,
始##始, 他, 在, 1, 月, 份, 大会, 上, 说, 的, 确实, 在, 理, 末##末,
始##始, 他, 在, 1, 月份, 大, 会上, 说, 的, 确实, 在理, 末##末,
//==== 经过数字、日期合并等策略处理后的 N 个结果:
始##始, 他, 在, 1月份, 大会, 上, 说, 的, 确实, 在, 理, 末##末,
始##始, 他, 在, 1月份, 大会, 上, 说, 的, 确实, 在理, 末##末,
始##始, 他, 在, 1月份, 大, 会上, 说, 的, 确实, 在, 理, 末##末,
始##始, 他, 在, 1月, 份, 大会, 上, 说, 的, 确实, 在, 理, 末##末,
始##始, 他, 在, 1月份, 大, 会上, 说, 的, 确实, 在理, 末##末,
原子分词看起来应当是程序中最简单的部分,无非是将汉字逐一分开,但它也是最值得改进的地方。SharpICTCLAS目前仍然沿用了原有ICTCLAS的算法并做了微小调整。但我对这种原子分词方法不太满意,如果有机会,可以考虑使用一系列正则表达式将某些“原子”词单独摘出来。比如“甲子”、“乙亥”等年份信息属于原子信息,还有URL、Email等都可以预先进行原子识别,这可以大大简化后续工作。因此日后可以考虑这方面的处理。
BiPath2UniPath(nSegRoute[i]); //Path convert to unipath
GenerateWord(nSegRoute, i); //Gernerate word according the Segmentation route
BiPath:(0, 1, 2, 3, 6, 9, 11, 12)
0 1 2 3 4 5 6 7 8 9 10 11 12
始##始 他 说 的 的确 确 确实 实 实在 在 在理 理 末##末
UniPath:(0, 1, 2, 3, 4, 6, 7, 8)
0 1 2 3 4 5 6 7 8
始##始 他 说 的 确 实 在 理 末##末
linkedArray = BiPath2LinkedArray(spResult[i], segGraph, atomSegment);
这样改造后,还使得原有ICTCLAS中 int *m_npWordPosMapTable;
bool CSegment::GenerateWord(int **nSegRoute, int nIndex)
unsigned int i = 0, k = 0;
int j, nStartVertex, nEndVertex, nPOS;
char sAtom[WORD_MAXLENGTH], sNumCandidate[100], sCurWord[100];
while (nSegRoute[nIndex][i] != - 1 && nSegRoute[nIndex][i + 1] != - 1 &&
nSegRoute[nIndex][i] < nSegRoute[nIndex][i + 1])
nStartVertex = nSegRoute[nIndex][i];
j = nStartVertex; //Set the start vertex
nEndVertex = nSegRoute[nIndex][i + 1]; //Set the end vertex
nPOS = 0;
m_graphSeg.m_segGraph.GetElement(nStartVertex, nEndVertex, &fValue, &nPOS);
sAtom[0] = 0;
while (j < nEndVertex)
//Generate the word according the segmentation route
strcat(sAtom, m_graphSeg.m_sAtom[j]);
m_pWordSeg[nIndex][k].sWord[0] = 0; //Init the result ending
strcpy(sNumCandidate, sAtom);
while (sAtom[0] != 0 && (IsAllNum((unsigned char*)sNumCandidate) ||
//Merge all seperate continue num into one number
//sAtom[0]!=0: add in 2002-5-9
strcpy(m_pWordSeg[nIndex][k].sWord, sNumCandidate);
//Save them in the result segmentation
i++; //Skip to next atom now
sAtom[0] = 0;
while (j < nSegRoute[nIndex][i + 1])
//Generate the word according the segmentation route
strcat(sAtom, m_graphSeg.m_sAtom[j]);
strcat(sNumCandidate, sAtom);
unsigned int nLen = strlen(m_pWordSeg[nIndex][k].sWord);
if (nLen == 4 && CC_Find("第上成±—+∶·./",
m_pWordSeg[nIndex][k].sWord) || nLen == 1 && strchr("+-./",
//Only one word
strcpy(sCurWord, m_pWordSeg[nIndex][k].sWord); //Record current word
else if (m_pWordSeg[nIndex][k].sWord[0] == 0)
//Have never entering the while loop
strcpy(m_pWordSeg[nIndex][k].sWord, sAtom);
//Save them in the result segmentation
strcpy(sCurWord, sAtom); //Record current word
//It is a num
if (strcmp("--", m_pWordSeg[nIndex][k].sWord) == 0 || strcmp("—",
m_pWordSeg[nIndex][k].sWord) == 0 || m_pWordSeg[nIndex][k].sWord[0] ==
'-' && m_pWordSeg[nIndex][k].sWord[1] == 0)
//The delimiter "--"
nPOS = 30464; //'w'*256;Set the POS with 'w'
i--; //Not num, back to previous word
//Adding time suffix
char sInitChar[3];
unsigned int nCharIndex = 0; //Get first char
sInitChar[nCharIndex] = m_pWordSeg[nIndex][k].sWord[nCharIndex];
if (sInitChar[nCharIndex] < 0)
nCharIndex += 1;
sInitChar[nCharIndex] = m_pWordSeg[nIndex][k].sWord[nCharIndex];
nCharIndex += 1;
sInitChar[nCharIndex] = '\0';
if (k > 0 && (abs(m_pWordSeg[nIndex][k - 1].nHandle) == 27904 || abs
(m_pWordSeg[nIndex][k - 1].nHandle) == 29696) && (strcmp(sInitChar,
"—") == 0 || sInitChar[0] == '-') && (strlen
(m_pWordSeg[nIndex][k].sWord) > nCharIndex))
//3-4月 //27904='m'*256
//Split the sInitChar from the original word
strcpy(m_pWordSeg[nIndex][k + 1].sWord, m_pWordSeg[nIndex][k].sWord +
m_pWordSeg[nIndex][k + 1].dValue = m_pWordSeg[nIndex][k].dValue;
m_pWordSeg[nIndex][k + 1].nHandle = 27904;
m_pWordSeg[nIndex][k].sWord[nCharIndex] = 0;
m_pWordSeg[nIndex][k].dValue = 0;
m_pWordSeg[nIndex][k].nHandle = 30464; //'w'*256;
m_graphOptimum.SetElement(nStartVertex, nStartVertex + 1,
m_pWordSeg[nIndex][k].dValue, m_pWordSeg[nIndex][k].nHandle,
nStartVertex += 1;
k += 1;
nLen = strlen(m_pWordSeg[nIndex][k].sWord);
if ((strlen(sAtom) == 2 && CC_Find("月日时分秒", sAtom)) || strcmp
(sAtom, "月份") == 0)
strcat(m_pWordSeg[nIndex][k].sWord, sAtom);
strcpy(sCurWord, "未##时");
nPOS = - 29696; //'t'*256;//Set the POS with 'm'
else if (strcmp(sAtom, "年") == 0)
if (IsYearTime(m_pWordSeg[nIndex][k].sWord))
strcat(m_pWordSeg[nIndex][k].sWord, sAtom);
strcpy(sCurWord, "未##时");
nPOS = - 29696; //Set the POS with 't'
strcpy(sCurWord, "未##数");
nPOS = - 27904; //Set the POS with 'm'
i--; //Can not be a time word
//早晨/t 五点/t
if (strcmp(m_pWordSeg[nIndex][k].sWord + strlen
(m_pWordSeg[nIndex][k].sWord) - 2, "点") == 0)
strcpy(sCurWord, "未##时");
nPOS = - 29696; //Set the POS with 't'
if (!CC_Find("∶·./", m_pWordSeg[nIndex][k].sWord + nLen - 2) &&
m_pWordSeg[nIndex][k].sWord[nLen - 1] != '.' &&
m_pWordSeg[nIndex][k].sWord[nLen - 1] != '/')
strcpy(sCurWord, "未##数");
nPOS = - 27904; //'m'*256;Set the POS with 'm'
else if (nLen > strlen(sInitChar))
//Get rid of . example 1.
if (m_pWordSeg[nIndex][k].sWord[nLen - 1] == '.' ||
m_pWordSeg[nIndex][k].sWord[nLen - 1] == '/')
m_pWordSeg[nIndex][k].sWord[nLen - 1] = 0;
m_pWordSeg[nIndex][k].sWord[nLen - 2] = 0;
strcpy(sCurWord, "未##数");
nPOS = - 27904; //'m'*256;Set the POS with 'm'
i--; //Not num, back to previous word
fValue = 0;
nEndVertex = nSegRoute[nIndex][i + 1]; //Ending POS changed to latter
m_pWordSeg[nIndex][k].nHandle = nPOS; //Get the POS of current word
m_pWordSeg[nIndex][k].dValue = fValue;
//(int)(MAX_FREQUENCE*exp(-fValue));//Return the frequency of current word
m_graphOptimum.SetElement(nStartVertex, nEndVertex, fValue, nPOS, sCurWord);
//Generate optimum segmentation graph according the segmentation result
i++; //Skip to next atom
k++; //Accept next word
m_pWordSeg[nIndex][k].sWord[0] = 0;
m_pWordSeg[nIndex][k].nHandle = - 1; //Set ending
return true;
RowFirstDynamicArray<ChainContent> m_graphOptimum)
if (linkedArray.Count == 0)
return null;
//Merge all seperate continue num into one number
MergeContinueNumIntoOne(ref linkedArray);
//The delimiter "--"
ChangeDelimiterPOS(ref linkedArray);
//例如 “3 / -4 / 月”需要拆分成“3 / - / 4 / 月”
SplitMiddleSlashFromDigitalWords(ref linkedArray);
//3、如果最后一个汉字是"点" ,则认为当前数字是时间
CheckDateElements(ref linkedArray);
WordResult[] result = new WordResult[linkedArray.Count];
WordNode pCur = linkedArray.first;
int i = 0;
while (pCur != null)
WordResult item = new WordResult();
item.sWord = pCur.theWord.sWord;
item.nPOS = pCur.theWord.nPOS;
item.dValue = pCur.theWord.dValue;
result[i] = item;
m_graphOptimum.SetElement(pCur.row, pCur.col, new ChainContent(item.dValue, item.nPOS, pCur.sWordInSegGraph));
pCur = pCur.next;
return result;
- 小结
