ICTCLAS-2011-使用小结

1. 简述

    以前在百度空间上写过一篇,最近又用到这个工具包,以前博文里面的废话太多,而且代码方面说的太少,这里再重新写一写。

2. 哪里下载

    官方网站的下载页面上面版本很全很多,不过C#版本的接口貌似是2008年的,2011年的只提供了C/C++和JAVA的接口。这里使用的是张华平博士(ICTCLAS的作者)博客上面链接的ICTCLAS2011的最新u0404 版本

3. 文件组织

    下载链接里面有3个压缩文件,我使用的是ICTCLAS2011-SDK-release.rar,具体把哪些文件放到C#工程里面,参考一下readme.txt文件就行,主要三个部分,第一个部分是配置文件、词典文件、日志文件,第二部分是数据文件,包括标注集等,第三部分,是接口文件。

    实际上就是两个步骤:第一步,把"ICTCLAS2011-SDK-release"目录下的所有文件和Data文件夹都拷贝到debug目录下;第二步,把"ICTCLAS2011-SDK-release\Windows下的C#接口"目录下的所有文件拷贝到debug目录下。第二步会有一些文件被覆盖,直接覆盖就好了,被覆盖的就是接口文件之类的。

4. API调用

    文档里面说的很清楚,这里再说一说比较重要的几个方面。

4.1 接口引入

    在要使用的类文件中,添加如下三个部分代码。
    第一个部分是引用命名空间:

using System.IO;
using System.Runtime.InteropServices;

    第二个部分是结构体定义 

View Code
    /// <summary>
    /// One entry of the segmentation result written by
    /// <c>ICTCLAS_ParagraphProcessAW</c>. The layout is fixed explicitly:
    /// eight consecutive 4-byte fields, 32 bytes in total.
    /// </summary>
    [StructLayout(LayoutKind.Explicit)]
    public struct result_t
    {
        /// <summary>Start position of the word in the source text (the article halves it to get a character index).</summary>
        [FieldOffset(0)] public int start;

        /// <summary>Length of the word (the article halves it to get a character count).</summary>
        [FieldOffset(4)] public int length;

        // NOTE(review): sPos and sPosLow together cover bytes 8..15; they presumably
        // overlay a native 8-byte POS tag buffer - confirm against the SDK header.
        [FieldOffset(8)] public int sPos;
        [FieldOffset(12)] public int sPosLow;

        /// <summary>Zero-based index of the word's part-of-speech tag in the tag-set file.</summary>
        [FieldOffset(16)] public int POS_id;

        // The three fields below are not used anywhere in this article; their
        // meaning has to be taken from the SDK documentation.
        [FieldOffset(20)] public int word_ID;
        [FieldOffset(24)] public int word_type;
        [FieldOffset(28)] public int weight;
    }

   第三个部分,是路径设置和API导入

View Code
        // File name of the native ICTCLAS library used by every DllImport below.
        const string path = @"ICTCLAS2011.dll";
#region
// NOTE(review): the declarations returning bool use the default marshalling,
// which treats the native return value as a 4-byte Win32 BOOL. If the native
// functions return a 1-byte C++ bool, [return: MarshalAs(UnmanagedType.I1)]
// is required - confirm against the SDK header.

/// <summary>
/// Initializes ICTCLAS. <paramref name="sInitDirPath"/> is the directory that
/// holds the ICTCLAS files; the article passes null to search the process's
/// current directory. Returns false when initialization fails.
/// </summary>
[DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Init")]
public static extern bool ICTCLAS_Init(String sInitDirPath);

// Presumably segments sParagraph and returns the result as text, with
// bPOStagged selecting whether POS tags are included - TODO confirm.
// NOTE(review): with a String return type the CLR marshaller copies the
// native string and then frees the native pointer with CoTaskMemFree; confirm
// the library expects that, otherwise return IntPtr and convert it with
// Marshal.PtrToStringAnsi.
[DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcess")]
public static extern String ICTCLAS_ParagraphProcess(String sParagraph, int bPOStagged);

// Presumably releases what ICTCLAS_Init acquired - TODO confirm.
[DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Exit")]
public static extern bool ICTCLAS_Exit();

// Presumably loads a user dictionary from sFilename; the meaning of the
// returned int is not shown in this article - TODO confirm.
[DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDict")]
public static extern int ICTCLAS_ImportUserDict(String sFilename);

// Presumably segments the file sSrcFilename into sDestFilename, with
// bPOStagged selecting whether POS tags are written - TODO confirm.
[DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_FileProcess")]
public static extern bool ICTCLAS_FileProcess(String sSrcFilename, String sDestFilename, int bPOStagged);

// Variant of ICTCLAS_FileProcess without the bPOStagged argument; how its
// output differs is not shown in this article - TODO confirm.
[DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_FileProcessEx")]
public static extern bool ICTCLAS_FileProcessEx(String sSrcFilename, String sDestFilename);

/// <summary>
/// Returns the number of words in the segmentation result for
/// <paramref name="sParagraph"/>. The article calls it first so that the
/// caller can size the array passed to ICTCLAS_ParagraphProcessAW.
/// </summary>
[DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_GetParagraphProcessAWordCount")]
static extern int ICTCLAS_GetParagraphProcessAWordCount(String sParagraph);
// Writes the segmentation result into the caller-allocated array `result`,
// which the article sizes with nCount elements. It takes no text argument, so
// it presumably returns the result of the preceding
// ICTCLAS_GetParagraphProcessAWordCount call - TODO confirm.
[DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcessAW")]
static extern void ICTCLAS_ParagraphProcessAW(int nCount, [Out, MarshalAs(UnmanagedType.LPArray)] result_t[] result);

// Presumably adds sWord to the user dictionary - TODO confirm.
[DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_AddUserWord")]
static extern int ICTCLAS_AddUserWord(String sWord);

// Presumably persists the user dictionary - TODO confirm.
[DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SaveTheUsrDic")]
static extern int ICTCLAS_SaveTheUsrDic();


// Presumably removes sWord from the user dictionary - TODO confirm.
[DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_DelUsrWord")]
static extern int ICTCLAS_DelUsrWord(String sWord);
#endregion

4.2 初始化ICTCLAS   

// Initialize ICTCLAS without an explicit directory (null), i.e. relative to
// the process's current directory, and report a failure to the user.
bool initialized = ICTCLAS_Init(null);
if (initialized == false)
{
    MessageBox.Show("Init ICTCLAS failed!");
}

    样例程序基本上是上面这样的,如果前面文件组织得不对,比如少了配置文件等等,就会初始化失败。另外需要注意,默认是从程序的当前目录下寻找ICTCLAS的那些文件的,即Environment.CurrentDirectory指定的路径。如果使用OpenFileDialog这样的组件,可能会改变程序的当前目录,为了解决这个问题,需要在ICTCLAS_Init时指定目录,代码如下:

// Pass the application's base directory explicitly so that initialization
// does not depend on the process's current directory.
if (ICTCLAS_Init(AppDomain.CurrentDomain.BaseDirectory) == false)
{
    MessageBox.Show("Init ICTCLAS failed!");
}

4.3 分词和词性标注

    待分词的字符串都要转化为全角形式,如果字符串中既有半角又有全角,分词结果就不对。半角全角转化的代码网上很多,这里给一个以前网上找的。注意所谓全角形式主要是针对英文字母:(GBK编码下)中文一个字符都是两个字节,而英文一个字符在半角形式下是一个字节,在全角形式下是两个字节。

View Code
        /// <summary>
        /// Converts half-width (DBC) characters to their full-width (SBC) forms.
        /// </summary>
        /// <param name="input">Any string; null or empty input is returned unchanged.</param>
        /// <returns>The string with spaces and printable ASCII replaced by full-width characters.</returns>
        /// <remarks>
        /// The half-width space (32) maps to the full-width space (12288).
        /// The other half-width characters (33-126) map to 65281-65374, a constant
        /// shift of 65248. Everything else is left untouched, including control
        /// characters such as CR, LF and TAB, which have no full-width form.
        /// </remarks>
        private string ToSBC(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return input;
            }

            char[] chars = input.ToCharArray();
            for (int i = 0; i < chars.Length; i++)
            {
                char ch = chars[i];
                if (ch == 32)
                {
                    chars[i] = (char)12288;
                }
                else if (ch > 32 && ch < 127)
                {
                    // Only 33-126 have a full-width twin; shifting control
                    // characters (0-31) would produce unrelated code points.
                    chars[i] = (char)(ch + 65248);
                }
            }
            return new string(chars);
        }

/// <summary>Converts full-width (SBC) characters to their half-width (DBC) forms.</summary>
/// <param name="input">Any string.</param>
/// <returns>The string with full-width characters replaced by half-width ones.</returns>
/// <remarks>
/// The full-width space (12288) maps to the half-width space (32).
/// The other full-width characters (65281-65374) map to 33-126, a constant
/// shift of 65248. All remaining characters are copied unchanged.
/// </remarks>
private string ToDBC(string input)
{
    const int WidthShift = 65248;

    char[] converted = new char[input.Length];
    int index = 0;
    foreach (char ch in input)
    {
        if (ch == 12288)
        {
            converted[index] = (char)32;
        }
        else if (ch >= 65281 && ch <= 65374)
        {
            converted[index] = (char)(ch - WidthShift);
        }
        else
        {
            converted[index] = ch;
        }
        index++;
    }
    return new string(converted);
}

    分词的代码如下:

int count = ICTCLAS_GetParagraphProcessAWordCount(srcText);// first obtain the number of words in the result
result_t[] result = new result_t[count];// allocate the result buffer on the caller's side
ICTCLAS_ParagraphProcessAW(count, result);// have the results written into the caller's buffer

    结果都存储在result这个数组中。这里注意通过result_t类型中的start和length成员可以找到分词后的词语,通过result_t类型中的POS_id成员可以找到分词后词语对应的词性。这里需要注意start和length使用的时候都要除以2。比如我们要找到srcText分词后的第1个词语,即为srcText.Substring(result[0].start/2, result[0].length/2),其词性为result[0].POS_id。这里说一下这个POS_id,这个id是int类型的,对应当前使用的标注集文件中词性的位置。默认使用的是Data文件夹下的ICTPOS.map标注集文件,值得注意的是,这个文件中有95行,一行一个词性,假设POS_id当前是24,那么其对应的词性是文件中下标为24的那一行(行号从0开始数,即从1数起的第25行),也就是说这个ID是从0下标开始算的。所以要获取真正的词性的话,还要首先把这些词性从标注集文件中读出来才行。

5. 程序示例   

View Code
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Runtime.InteropServices;

namespace ICTCLAS_2011_Tool
{
// ICTCLAS
/// <summary>
/// One entry of the segmentation result written by
/// <c>ICTCLAS_ParagraphProcessAW</c>. The layout is fixed explicitly:
/// eight consecutive 4-byte fields, 32 bytes in total.
/// </summary>
[StructLayout(LayoutKind.Explicit)]
public struct result_t
{
    /// <summary>Start position of the word in the source text (the sample halves it to get a character index).</summary>
    [FieldOffset(0)] public int start;

    /// <summary>Length of the word (the sample halves it to get a character count).</summary>
    [FieldOffset(4)] public int length;

    // NOTE(review): sPos and sPosLow together cover bytes 8..15; they presumably
    // overlay a native 8-byte POS tag buffer - confirm against the SDK header.
    [FieldOffset(8)] public int sPos;
    [FieldOffset(12)] public int sPosLow;

    /// <summary>Zero-based index of the word's part-of-speech tag in the tag-set file.</summary>
    [FieldOffset(16)] public int POS_id;

    // The three fields below are not read by the sample program; their meaning
    // has to be taken from the SDK documentation.
    [FieldOffset(20)] public int word_ID;
    [FieldOffset(24)] public int word_type;
    [FieldOffset(28)] public int weight;
}


/// <summary>
/// Demo form: segments the text in textBox_srcText with ICTCLAS and shows every
/// word followed by its part-of-speech tag in textBox_dstText.
/// </summary>
public partial class MainForm : Form
{
    // ICTCLAS
    const string path = @"ICTCLAS2011.dll";
    #region ICTCLAS native API
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Init")]
    public static extern bool ICTCLAS_Init(String sInitDirPath);

    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcess")]
    public static extern String ICTCLAS_ParagraphProcess(String sParagraph, int bPOStagged);

    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Exit")]
    public static extern bool ICTCLAS_Exit();

    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDict")]
    public static extern int ICTCLAS_ImportUserDict(String sFilename);

    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_FileProcess")]
    public static extern bool ICTCLAS_FileProcess(String sSrcFilename, String sDestFilename, int bPOStagged);

    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_FileProcessEx")]
    public static extern bool ICTCLAS_FileProcessEx(String sSrcFilename, String sDestFilename);

    // Returns the number of words in the segmentation result; used to size the
    // array handed to ICTCLAS_ParagraphProcessAW.
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_GetParagraphProcessAWordCount")]
    static extern int ICTCLAS_GetParagraphProcessAWordCount(String sParagraph);

    // Writes the segmentation result into the caller-allocated array.
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcessAW")]
    static extern void ICTCLAS_ParagraphProcessAW(int nCount, [Out, MarshalAs(UnmanagedType.LPArray)] result_t[] result);

    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_AddUserWord")]
    static extern int ICTCLAS_AddUserWord(String sWord);

    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SaveTheUsrDic")]
    static extern int ICTCLAS_SaveTheUsrDic();

    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_DelUsrWord")]
    static extern int ICTCLAS_DelUsrWord(String sWord);
    #endregion

    public MainForm()
    {
        InitializeComponent();
    }

    // Part-of-speech tags, one per line of Data\ICTPOS.map; result_t.POS_id is
    // a zero-based index into this list.
    List<string> POS_MAP = null;

    // True only while ICTCLAS_Init has succeeded and ICTCLAS_Exit has not yet
    // been called; guards every other native call.
    private bool ictclasReady;

    /// <summary>
    /// Fills in the sample text, initializes ICTCLAS from the application's
    /// base directory and loads the POS tag list.
    /// </summary>
    private void MainForm_Load(object sender, EventArgs e)
    {
        textBox_srcText.Text = "点击下载超女纪敏佳深受观众喜爱。禽流感爆发在非典之后。";

        // init ictclas
        // NOTE(review): the current directory is moved away from the program
        // folder here, presumably to demonstrate that passing an explicit path
        // to ICTCLAS_Init is sufficient; drop this line in real code.
        Environment.CurrentDirectory = @"C:\";
        string sInitDirPath = AppDomain.CurrentDomain.BaseDirectory;
        ictclasReady = ICTCLAS_Init(sInitDirPath);
        if (!ictclasReady)
        {
            MessageBox.Show("Init ICTCLAS failed!");
        }

        // get pos map; the using blocks close the file even if reading throws
        POS_MAP = new List<string>();
        string mapFile = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"Data\ICTPOS.map");
        using (FileStream fs = new FileStream(mapFile, FileMode.Open, FileAccess.Read, FileShare.Read))
        using (StreamReader sr = new StreamReader(fs))
        {
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                POS_MAP.Add(line);
            }
        }
    }

    /// <summary>
    /// Segments the source text and writes "word /tag " for every word into
    /// the destination text box.
    /// </summary>
    private void button_start_Click(object sender, EventArgs e)
    {
        string srcText = textBox_srcText.Text;
        if (srcText == "")
        {
            return;
        }
        if (!ictclasReady)
        {
            // Calling into the library after a failed init is not safe.
            MessageBox.Show("Init ICTCLAS failed!");
            return;
        }

        // Segment: get the word count first, then let the library fill a
        // caller-allocated array of that size.
        int count = ICTCLAS_GetParagraphProcessAWordCount(srcText);
        if (count <= 0)
        {
            textBox_dstText.Text = "";
            return;
        }
        result_t[] result = new result_t[count];
        ICTCLAS_ParagraphProcessAW(count, result);

        // start/length are positions in the ANSI bytes that CharSet.Ansi
        // marshalling hands to the library, so slice the same byte sequence
        // instead of halving the offsets. Halving is only right when every
        // character occupies two bytes, so it breaks on half-width letters and
        // digits. NOTE(review): Encoding.Default is the ANSI code page on
        // .NET Framework, which this sample targets.
        Encoding ansi = Encoding.Default;
        byte[] srcBytes = ansi.GetBytes(srcText);

        // Display
        StringBuilder dstText = new StringBuilder();
        for (int i = 0; i < result.Length; i++)
        {
            int start = result[i].start;
            int length = result[i].length;
            if (start < 0 || length < 0 || start > srcBytes.Length - length)
            {
                // Entry points outside the text; skip it rather than throw.
                continue;
            }
            string word = ansi.GetString(srcBytes, start, length);

            // Fall back to the raw id when the tag file has no such line.
            int posId = result[i].POS_id;
            string type = (posId >= 0 && posId < POS_MAP.Count) ? POS_MAP[posId] : posId.ToString();

            dstText.Append(word).Append(@" /").Append(type).Append(" ");
        }
        textBox_dstText.Text = dstText.ToString();
    }

    /// <summary>
    /// Releases ICTCLAS when the form closes, if it was initialized.
    /// </summary>
    protected override void OnFormClosed(FormClosedEventArgs e)
    {
        if (ictclasReady)
        {
            ictclasReady = false;
            ICTCLAS_Exit();
        }
        base.OnFormClosed(e);
    }
}
}

    程序输出如下:

       

6. 参考

    官方网站    http://ictclas.cn/index.html
    ICTCLAS2011 u0404    http://hi.baidu.com/drkevinzhang/blog/item/25074dae23da861c4a36d626.html 

posted @ 2011-10-09 17:31  xiaodongrush  阅读(3000)  评论(2)  编辑  收藏  举报