ICTCLAS.NET——给C/C++程序写.NET wrapper
很多时候想通过.NET调用一些C/C++写的库,但是一直都不知道怎么弄。去网上找了一些资料,大多数是教如何通过托管C++和非托管C++的混合编程来完成C/C++类库的.NET Wrapper。
有的时候用C#来实现一个功能的时候,可能要调用windows api,往往都是到网上现查代码,然后粘过来使用,没有细研究到底是怎么做到的。
最近一个朋友用到分词,所以就研究了一些中科院提供的中文分词软件,详情请访问http://ictclas.org/。用了一下还挺好用的,速度没有测试,感觉应该用于学术研究肯定是没有问题的,如果要用到项目中,我觉得还是公司自己实现会比较好。
可惜该组件没有提供.NET版本,只有C++和Java版本(Java版也是通过调用本地C语言的版本实现的)。给的开发包中有一个编译好的dll库。
想想之前调用windows api的时候,不正是从一些系统的dll中导入函数,然后再通过C#代码进行调用的吗?想到这里,我就觉得我可以通过导入该分词dll中的函数,用C#来完成该程序的wrapper。
说干就干,我试着导入了几个简单的函数,发现可以调用,感觉非常好,然后花了一段时间给这个库写了.NET Wrapper,方便自己以后用C#调用该接口来分词。
核心代码如下,用单例模式实现,感觉设计得不是很好,不知道各位有没有什么建议?
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Runtime.InteropServices;
namespace ICTCLAS.NET
{
//////////////////////////////////////////////////////////////////////////
// character coding types
//////////////////////////////////////////////////////////////////////////
/// <summary>
/// Character encodings that a caller can declare for the text passed to the wrapper.
/// The numeric values are the implicit 0..4 sequence of the original declaration.
/// </summary>
public enum ECodeType
{
    /// <summary>Encoding type unknown.</summary>
    CODE_TYPE_UNKNOWN = 0,

    /// <summary>ASCII.</summary>
    CODE_TYPE_ASCII = 1,

    /// <summary>
    /// GB family: GB2312, GBK, and "GB10380" as written in the original comment
    /// (presumably GB18030 — TODO confirm).
    /// </summary>
    CODE_TYPE_GB = 2,

    /// <summary>UTF-8.</summary>
    CODE_TYPE_UTF8 = 3,

    /// <summary>BIG5.</summary>
    CODE_TYPE_BIG5 = 4
}
/// <summary>
/// Part-of-speech tag sets that can be selected, plus two size constants
/// that the original declaration keeps in the same enum.
/// </summary>
public enum EPosTag
{
    /// <summary>ICT (Institute of Computing Technology) second-level tag set.</summary>
    ICT_POS_MAP_SECOND = 0,

    /// <summary>ICT (Institute of Computing Technology) first-level tag set.</summary>
    ICT_POS_MAP_FIRST = 1,

    /// <summary>PKU (Peking University) second-level tag set.</summary>
    PKU_POS_MAP_SECOND = 2,

    /// <summary>PKU (Peking University) first-level tag set.</summary>
    PKU_POS_MAP_FIRST = 3,

    /// <summary>Number of tag sets.</summary>
    POS_MAP_NUMBER = 4,

    /// <summary>Maximum number of bytes in a part-of-speech tag.</summary>
    POS_SIZE = 8
}
/// <summary>
/// One segmentation result record as exchanged with the native segmenter.
/// Eight consecutive 32-bit integers (32 bytes in total); sequential layout with
/// 4-byte packing places them at offsets 0, 4, 8, ... 28, the same offsets the
/// fields were given explicitly before.
/// </summary>
[StructLayout(LayoutKind.Sequential, Pack = 4)]
struct result_t
{
    /// <summary>Start position of the word (used as a byte offset by the wrapper).</summary>
    public int start;

    /// <summary>Length of the word (used as a byte count by the wrapper).</summary>
    public int length;

    /// <summary>First four bytes of the part-of-speech slot — presumably tag text; confirm against the native header.</summary>
    public int sPos;

    /// <summary>Next four bytes of the part-of-speech slot — presumably tag text; confirm against the native header.</summary>
    public int sPosLow;

    /// <summary>Part-of-speech identifier.</summary>
    public int POS_id;

    /// <summary>Word identifier.</summary>
    public int word_ID;

    /// <summary>Word type.</summary>
    public int word_type;

    /// <summary>Weight of the word.</summary>
    public int weight;
}
/// <summary>
/// A segmented word as returned to managed callers: its text plus the numeric
/// attributes copied from the corresponding native result record.
/// </summary>
public struct Word
{
    /// <summary>Text of the word.</summary>
    public string str;

    /// <summary>Part-of-speech identifier.</summary>
    public int pos_id;

    /// <summary>Word identifier.</summary>
    public int word_id;

    /// <summary>Weight of the word.</summary>
    public int weight;

    /// <summary>Word type.</summary>
    public int word_type;
}
/// <summary>
/// Managed wrapper around the native ICTCLAS50.dll word segmenter.
/// A single shared instance is handed out by <see cref="GetInstance"/>, which also
/// performs the one-time native initialization; <see cref="Release"/> shuts the
/// native library down again.
/// </summary>
/// <remarks>
/// NOTE(review): the P/Invoke declarations use the default calling convention and the
/// default <c>bool</c> marshalling. If the native exports are cdecl and/or use the
/// one-byte C++ <c>bool</c>, the declarations may need
/// <c>CallingConvention.Cdecl</c> / <c>UnmanagedType.I1</c> — confirm against the native header.
/// </remarks>
public class WordSegger
{
    /// <summary>
    /// Holder for the single instance. The empty static constructor keeps the
    /// compiler from marking the type <c>beforefieldinit</c>, so the instance is
    /// created only when <see cref="instance"/> is first accessed.
    /// </summary>
    class Nested
    {
        static Nested()
        {
        }
        internal static readonly WordSegger instance = new WordSegger();
    }

    // Guards native initialization and shutdown.
    private static readonly object lockobj = new object();

    // True while the native library is initialized. Volatile because it is read
    // outside the lock in GetInstance/Release.
    private static volatile bool inited = false;

    /// <summary>
    /// Gets the segmenter. The first call must supply <paramref name="path"/>;
    /// later calls can omit it.
    /// </summary>
    /// <param name="path">Location of the configuration file and data files.</param>
    /// <returns>The shared instance, or <c>null</c> when native initialization fails.</returns>
    public static WordSegger GetInstance(string path = "")
    {
        // Fast path: no lock needed once initialization has succeeded.
        if (inited)
        {
            return Nested.instance;
        }
        lock (lockobj)
        {
            if (!inited)
            {
                inited = ICTCLAS_Init(path);
                if (!inited)
                {
                    return null;
                }
            }
            return Nested.instance;
        }
    }

    /// <summary>
    /// Explicitly releases the unmanaged resources held by the native segmenter.
    /// Does nothing when the library is not currently initialized.
    /// </summary>
    public static void Release()
    {
        if (inited)
        {
            lock (lockobj)
            {
                if (inited)
                {
                    ICTCLAS_Exit();
                    inited = false;
                }
            }
        }
    }

    /// <summary>
    /// Private constructor; instances are obtained through <see cref="GetInstance"/>.
    /// </summary>
    private WordSegger()
    {
    }

    /// <summary>
    /// Finalizer: releases the native resources if they are still held.
    /// </summary>
    ~WordSegger()
    {
        Release();
    }

    /// <summary>
    /// Segments a string into words.
    /// </summary>
    /// <param name="str">Text to segment.</param>
    /// <param name="ecode">Encoding declared to the native segmenter.</param>
    /// <param name="posTagged">Whether to perform part-of-speech tagging.</param>
    /// <returns>The segmented words; an empty array when there is nothing to return.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is <c>null</c>.</exception>
    public Word[] SegStr(string str, ECodeType ecode, bool posTagged = false)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }
        if (str.Length == 0)
        {
            // Nothing to segment; avoid handing the native code a zero-length result buffer.
            return new Word[0];
        }

        // One result slot per input character — assumes every word covers at
        // least one character, so the native side cannot produce more entries.
        result_t[] result = new result_t[str.Length];

        int cnt = ICTCLAS_ParagraphProcessAW(str, result, ecode, posTagged);
        if (cnt <= 0)
        {
            // Zero words, or a negative value which would otherwise crash the array allocation.
            return new Word[0];
        }

        // The wrapper treats start/length as byte positions, so words are cut from
        // the encoded bytes rather than from the UTF-16 string.
        // NOTE(review): this uses Encoding.Default, which is expected to match the
        // ANSI bytes produced by CharSet.Ansi marshalling — confirm on the target runtime.
        byte[] mybyte = Encoding.Default.GetBytes(str);
        Word[] words = new Word[cnt];
        for (int i = 0; i < cnt; i++)
        {
            words[i].str = Encoding.Default.GetString(mybyte, result[i].start, result[i].length);
            words[i].pos_id = result[i].POS_id;
            words[i].word_id = result[i].word_ID;
            words[i].weight = result[i].weight;
            words[i].word_type = result[i].word_type;
        }
        return words;
    }

    /// <summary>
    /// Segments a text file.
    /// </summary>
    /// <param name="src">Path of the source file.</param>
    /// <param name="ct">Encoding.</param>
    /// <param name="des">Path of the destination file.</param>
    /// <param name="postag">Whether to perform part-of-speech tagging.</param>
    /// <returns>Whether segmentation succeeded.</returns>
    public bool SegFile(string src, ECodeType ct, string des, bool postag = false)
    {
        return ICTCLAS_FileProcess(src, des, ct, postag);
    }

    /// <summary>
    /// Selects the part-of-speech tag set.
    /// </summary>
    /// <param name="nPOSmap">Tag set to use.</param>
    /// <returns>Success / failure.</returns>
    public bool SetPosTagMap(EPosTag nPOSmap)
    {
        return ICTCLAS_SetPOSmap(nPOSmap);
    }

    /// <summary>
    /// Imports a user dictionary from a file.
    ///
    /// File format:
    /// 1. Word and part of speech are separated by "@@", e.g. "中科院@@nr";
    /// 2. One word per line;
    /// 3. The part of speech may be omitted.
    /// </summary>
    /// <param name="path">Path of the user dictionary file.</param>
    /// <param name="ct">Encoding.</param>
    /// <returns>Number of user-defined words imported.</returns>
    public uint ImportUserDictFile(string path, ECodeType ct = ECodeType.CODE_TYPE_UNKNOWN)
    {
        return ICTCLAS_ImportUserDictFile(path, ct);
    }

    /// <summary>
    /// Imports user words from a string.
    /// </summary>
    /// <param name="userDict">
    /// User words:
    /// 1. Word and part of speech are separated by "@@";
    /// 2. Words are separated by a half-width ';';
    /// 3. The part of speech may be omitted.
    /// For example "中科院@@nr;分词 v;系统@@adj;……;"
    /// or "中科院;分词;系统;……;".
    /// </param>
    /// <param name="ct">Encoding.</param>
    /// <returns>Number of user words imported.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="userDict"/> is <c>null</c>.</exception>
    public uint ImportUserDict(string userDict, ECodeType ct = ECodeType.CODE_TYPE_UNKNOWN)
    {
        if (userDict == null)
        {
            throw new ArgumentNullException("userDict");
        }
        // The string is marshalled as a multi-byte ANSI buffer, so the length handed to
        // the native side must be the encoded byte count; the UTF-16 char count
        // under-counts any text containing multi-byte (e.g. Chinese) characters.
        // NOTE(review): assumes the native length parameter is in bytes — confirm.
        int byteLength = Encoding.Default.GetByteCount(userDict);
        return ICTCLAS_ImportUserDict(userDict, byteLength, ct);
    }

    /// <summary>
    /// Saves the user dictionary.
    /// </summary>
    /// <returns><c>true</c> when the native call returns a non-zero value.</returns>
    public bool SaveUserDict()
    {
        return ICTCLAS_SaveTheUsrDic() != 0;
    }

    const string DLLPATH = @"ICTCLAS50.dll";

    /// <summary>
    /// Initialization. This call must succeed before any other interface is used.
    /// </summary>
    /// <param name="sInitDirPath">Directory containing the configuration file and data files.</param>
    /// <returns>Whether initialization succeeded.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Init")]
    private static extern bool ICTCLAS_Init(string sInitDirPath);

    /// <summary>
    /// Releases resources. Call this once all operations are finished.
    /// </summary>
    /// <returns>Whether the call succeeded.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Exit")]
    private static extern bool ICTCLAS_Exit();

    /// <summary>
    /// Selects the part-of-speech tag set.
    /// </summary>
    /// <param name="nPOSmap">Tag set to use.</param>
    /// <returns>Whether the call succeeded.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SetPOSmap")]
    private static extern bool ICTCLAS_SetPOSmap(EPosTag nPOSmap);

    /// <summary>
    /// Imports a user-defined dictionary from a file.
    /// File format:
    /// 1. Word and part of speech are separated by "@@", e.g. "中科院@@nr";
    /// 2. One word per line;
    /// 3. The part of speech may be omitted.
    /// </summary>
    /// <param name="pszFileName">Path of the user dictionary.</param>
    /// <param name="codeType">Encoding of the dictionary.</param>
    /// <returns>Number of words imported.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDictFile")]
    private static extern uint ICTCLAS_ImportUserDictFile(string pszFileName, ECodeType codeType = ECodeType.CODE_TYPE_UNKNOWN);

    /// <summary>
    /// Imports a user dictionary from a string.
    /// 1. Builds the user dictionary from the supplied words.
    /// 2. That dictionary replaces the user dictionary currently held in memory.
    /// </summary>
    /// <param name="pszDictBuffer">
    /// User dictionary string:
    /// 1. Word and part of speech are separated by "@@";
    /// 2. Words are separated by a half-width ';';
    /// 3. The part of speech may be omitted.
    /// For example "中科院@@nr;分词 v;系统@@adj;……;"
    /// or "中科院;分词;系统;……;".
    /// </param>
    /// <param name="length">Length of the string buffer.</param>
    /// <param name="codeType">Encoding.</param>
    /// <returns>Number of words successfully imported.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDict")]
    private static extern uint ICTCLAS_ImportUserDict(string pszDictBuffer, int length, ECodeType codeType);

    /// <summary>
    /// Saves the user dictionary.
    /// 1. Overwrites the user-related dictionaries in the /data/ folder.
    /// 2. The configuration file can specify whether the dictionary is used next time.
    /// </summary>
    /// <returns>Native status value; the wrapper treats non-zero as success.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SaveTheUsrDic")]
    private static extern int ICTCLAS_SaveTheUsrDic();

    /// <summary>
    /// Segments a paragraph of text.
    /// </summary>
    /// <param name="sParagraph">Text to process.</param>
    /// <param name="result">Receives the segmentation results.</param>
    /// <param name="eCT">Encoding of the text.</param>
    /// <param name="bPOSTagged">Whether to perform part-of-speech tagging.</param>
    /// <returns>Number of results written (used by the wrapper as the word count).</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcessAW")]
    private static extern int ICTCLAS_ParagraphProcessAW(string sParagraph, [Out, MarshalAs(UnmanagedType.LPArray)]result_t[] result, ECodeType eCT, bool bPOSTagged = false);

    /// <summary>
    /// Segments a text file.
    /// </summary>
    /// <param name="sSrcFilename">Name of the file to segment.</param>
    /// <param name="sDsnFilename">Name of the destination file.</param>
    /// <param name="eCt">Encoding.</param>
    /// <param name="bPOStagged">Whether to perform part-of-speech tagging.</param>
    /// <returns>Whether the call succeeded.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_FileProcess")]
    private static extern bool ICTCLAS_FileProcess(string sSrcFilename, string sDsnFilename, ECodeType eCt, bool bPOStagged = false);
}
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
namespace ICTCLAS.NET
{
//////////////////////////////////////////////////////////////////////////
// character coding types
//////////////////////////////////////////////////////////////////////////
/// <summary>
/// Character encodings that a caller can declare for the text passed to the wrapper.
/// The numeric values are the implicit 0..4 sequence of the original declaration.
/// </summary>
public enum ECodeType
{
    /// <summary>Encoding type unknown.</summary>
    CODE_TYPE_UNKNOWN = 0,

    /// <summary>ASCII.</summary>
    CODE_TYPE_ASCII = 1,

    /// <summary>
    /// GB family: GB2312, GBK, and "GB10380" as written in the original comment
    /// (presumably GB18030 — TODO confirm).
    /// </summary>
    CODE_TYPE_GB = 2,

    /// <summary>UTF-8.</summary>
    CODE_TYPE_UTF8 = 3,

    /// <summary>BIG5.</summary>
    CODE_TYPE_BIG5 = 4
}
/// <summary>
/// Part-of-speech tag sets that can be selected, plus two size constants
/// that the original declaration keeps in the same enum.
/// </summary>
public enum EPosTag
{
    /// <summary>ICT (Institute of Computing Technology) second-level tag set.</summary>
    ICT_POS_MAP_SECOND = 0,

    /// <summary>ICT (Institute of Computing Technology) first-level tag set.</summary>
    ICT_POS_MAP_FIRST = 1,

    /// <summary>PKU (Peking University) second-level tag set.</summary>
    PKU_POS_MAP_SECOND = 2,

    /// <summary>PKU (Peking University) first-level tag set.</summary>
    PKU_POS_MAP_FIRST = 3,

    /// <summary>Number of tag sets.</summary>
    POS_MAP_NUMBER = 4,

    /// <summary>Maximum number of bytes in a part-of-speech tag.</summary>
    POS_SIZE = 8
}
/// <summary>
/// One segmentation result record as exchanged with the native segmenter.
/// Eight consecutive 32-bit integers (32 bytes in total); sequential layout with
/// 4-byte packing places them at offsets 0, 4, 8, ... 28, the same offsets the
/// fields were given explicitly before.
/// </summary>
[StructLayout(LayoutKind.Sequential, Pack = 4)]
struct result_t
{
    /// <summary>Start position of the word (used as a byte offset by the wrapper).</summary>
    public int start;

    /// <summary>Length of the word (used as a byte count by the wrapper).</summary>
    public int length;

    /// <summary>First four bytes of the part-of-speech slot — presumably tag text; confirm against the native header.</summary>
    public int sPos;

    /// <summary>Next four bytes of the part-of-speech slot — presumably tag text; confirm against the native header.</summary>
    public int sPosLow;

    /// <summary>Part-of-speech identifier.</summary>
    public int POS_id;

    /// <summary>Word identifier.</summary>
    public int word_ID;

    /// <summary>Word type.</summary>
    public int word_type;

    /// <summary>Weight of the word.</summary>
    public int weight;
}
/// <summary>
/// A segmented word as returned to managed callers: its text plus the numeric
/// attributes copied from the corresponding native result record.
/// </summary>
public struct Word
{
    /// <summary>Text of the word.</summary>
    public string str;

    /// <summary>Part-of-speech identifier.</summary>
    public int pos_id;

    /// <summary>Word identifier.</summary>
    public int word_id;

    /// <summary>Weight of the word.</summary>
    public int weight;

    /// <summary>Word type.</summary>
    public int word_type;
}
/// <summary>
/// Managed wrapper around the native ICTCLAS50.dll word segmenter.
/// A single shared instance is handed out by <see cref="GetInstance"/>, which also
/// performs the one-time native initialization; <see cref="Release"/> shuts the
/// native library down again.
/// </summary>
/// <remarks>
/// NOTE(review): the P/Invoke declarations use the default calling convention and the
/// default <c>bool</c> marshalling. If the native exports are cdecl and/or use the
/// one-byte C++ <c>bool</c>, the declarations may need
/// <c>CallingConvention.Cdecl</c> / <c>UnmanagedType.I1</c> — confirm against the native header.
/// </remarks>
public class WordSegger
{
    /// <summary>
    /// Holder for the single instance. The empty static constructor keeps the
    /// compiler from marking the type <c>beforefieldinit</c>, so the instance is
    /// created only when <see cref="instance"/> is first accessed.
    /// </summary>
    class Nested
    {
        static Nested()
        {
        }
        internal static readonly WordSegger instance = new WordSegger();
    }

    // Guards native initialization and shutdown.
    private static readonly object lockobj = new object();

    // True while the native library is initialized. Volatile because it is read
    // outside the lock in GetInstance/Release.
    private static volatile bool inited = false;

    /// <summary>
    /// Gets the segmenter. The first call must supply <paramref name="path"/>;
    /// later calls can omit it.
    /// </summary>
    /// <param name="path">Location of the configuration file and data files.</param>
    /// <returns>The shared instance, or <c>null</c> when native initialization fails.</returns>
    public static WordSegger GetInstance(string path = "")
    {
        // Fast path: no lock needed once initialization has succeeded.
        if (inited)
        {
            return Nested.instance;
        }
        lock (lockobj)
        {
            if (!inited)
            {
                inited = ICTCLAS_Init(path);
                if (!inited)
                {
                    return null;
                }
            }
            return Nested.instance;
        }
    }

    /// <summary>
    /// Explicitly releases the unmanaged resources held by the native segmenter.
    /// Does nothing when the library is not currently initialized.
    /// </summary>
    public static void Release()
    {
        if (inited)
        {
            lock (lockobj)
            {
                if (inited)
                {
                    ICTCLAS_Exit();
                    inited = false;
                }
            }
        }
    }

    /// <summary>
    /// Private constructor; instances are obtained through <see cref="GetInstance"/>.
    /// </summary>
    private WordSegger()
    {
    }

    /// <summary>
    /// Finalizer: releases the native resources if they are still held.
    /// </summary>
    ~WordSegger()
    {
        Release();
    }

    /// <summary>
    /// Segments a string into words.
    /// </summary>
    /// <param name="str">Text to segment.</param>
    /// <param name="ecode">Encoding declared to the native segmenter.</param>
    /// <param name="posTagged">Whether to perform part-of-speech tagging.</param>
    /// <returns>The segmented words; an empty array when there is nothing to return.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is <c>null</c>.</exception>
    public Word[] SegStr(string str, ECodeType ecode, bool posTagged = false)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }
        if (str.Length == 0)
        {
            // Nothing to segment; avoid handing the native code a zero-length result buffer.
            return new Word[0];
        }

        // One result slot per input character — assumes every word covers at
        // least one character, so the native side cannot produce more entries.
        result_t[] result = new result_t[str.Length];

        int cnt = ICTCLAS_ParagraphProcessAW(str, result, ecode, posTagged);
        if (cnt <= 0)
        {
            // Zero words, or a negative value which would otherwise crash the array allocation.
            return new Word[0];
        }

        // The wrapper treats start/length as byte positions, so words are cut from
        // the encoded bytes rather than from the UTF-16 string.
        // NOTE(review): this uses Encoding.Default, which is expected to match the
        // ANSI bytes produced by CharSet.Ansi marshalling — confirm on the target runtime.
        byte[] mybyte = Encoding.Default.GetBytes(str);
        Word[] words = new Word[cnt];
        for (int i = 0; i < cnt; i++)
        {
            words[i].str = Encoding.Default.GetString(mybyte, result[i].start, result[i].length);
            words[i].pos_id = result[i].POS_id;
            words[i].word_id = result[i].word_ID;
            words[i].weight = result[i].weight;
            words[i].word_type = result[i].word_type;
        }
        return words;
    }

    /// <summary>
    /// Segments a text file.
    /// </summary>
    /// <param name="src">Path of the source file.</param>
    /// <param name="ct">Encoding.</param>
    /// <param name="des">Path of the destination file.</param>
    /// <param name="postag">Whether to perform part-of-speech tagging.</param>
    /// <returns>Whether segmentation succeeded.</returns>
    public bool SegFile(string src, ECodeType ct, string des, bool postag = false)
    {
        return ICTCLAS_FileProcess(src, des, ct, postag);
    }

    /// <summary>
    /// Selects the part-of-speech tag set.
    /// </summary>
    /// <param name="nPOSmap">Tag set to use.</param>
    /// <returns>Success / failure.</returns>
    public bool SetPosTagMap(EPosTag nPOSmap)
    {
        return ICTCLAS_SetPOSmap(nPOSmap);
    }

    /// <summary>
    /// Imports a user dictionary from a file.
    ///
    /// File format:
    /// 1. Word and part of speech are separated by "@@", e.g. "中科院@@nr";
    /// 2. One word per line;
    /// 3. The part of speech may be omitted.
    /// </summary>
    /// <param name="path">Path of the user dictionary file.</param>
    /// <param name="ct">Encoding.</param>
    /// <returns>Number of user-defined words imported.</returns>
    public uint ImportUserDictFile(string path, ECodeType ct = ECodeType.CODE_TYPE_UNKNOWN)
    {
        return ICTCLAS_ImportUserDictFile(path, ct);
    }

    /// <summary>
    /// Imports user words from a string.
    /// </summary>
    /// <param name="userDict">
    /// User words:
    /// 1. Word and part of speech are separated by "@@";
    /// 2. Words are separated by a half-width ';';
    /// 3. The part of speech may be omitted.
    /// For example "中科院@@nr;分词 v;系统@@adj;……;"
    /// or "中科院;分词;系统;……;".
    /// </param>
    /// <param name="ct">Encoding.</param>
    /// <returns>Number of user words imported.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="userDict"/> is <c>null</c>.</exception>
    public uint ImportUserDict(string userDict, ECodeType ct = ECodeType.CODE_TYPE_UNKNOWN)
    {
        if (userDict == null)
        {
            throw new ArgumentNullException("userDict");
        }
        // The string is marshalled as a multi-byte ANSI buffer, so the length handed to
        // the native side must be the encoded byte count; the UTF-16 char count
        // under-counts any text containing multi-byte (e.g. Chinese) characters.
        // NOTE(review): assumes the native length parameter is in bytes — confirm.
        int byteLength = Encoding.Default.GetByteCount(userDict);
        return ICTCLAS_ImportUserDict(userDict, byteLength, ct);
    }

    /// <summary>
    /// Saves the user dictionary.
    /// </summary>
    /// <returns><c>true</c> when the native call returns a non-zero value.</returns>
    public bool SaveUserDict()
    {
        return ICTCLAS_SaveTheUsrDic() != 0;
    }

    const string DLLPATH = @"ICTCLAS50.dll";

    /// <summary>
    /// Initialization. This call must succeed before any other interface is used.
    /// </summary>
    /// <param name="sInitDirPath">Directory containing the configuration file and data files.</param>
    /// <returns>Whether initialization succeeded.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Init")]
    private static extern bool ICTCLAS_Init(string sInitDirPath);

    /// <summary>
    /// Releases resources. Call this once all operations are finished.
    /// </summary>
    /// <returns>Whether the call succeeded.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Exit")]
    private static extern bool ICTCLAS_Exit();

    /// <summary>
    /// Selects the part-of-speech tag set.
    /// </summary>
    /// <param name="nPOSmap">Tag set to use.</param>
    /// <returns>Whether the call succeeded.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SetPOSmap")]
    private static extern bool ICTCLAS_SetPOSmap(EPosTag nPOSmap);

    /// <summary>
    /// Imports a user-defined dictionary from a file.
    /// File format:
    /// 1. Word and part of speech are separated by "@@", e.g. "中科院@@nr";
    /// 2. One word per line;
    /// 3. The part of speech may be omitted.
    /// </summary>
    /// <param name="pszFileName">Path of the user dictionary.</param>
    /// <param name="codeType">Encoding of the dictionary.</param>
    /// <returns>Number of words imported.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDictFile")]
    private static extern uint ICTCLAS_ImportUserDictFile(string pszFileName, ECodeType codeType = ECodeType.CODE_TYPE_UNKNOWN);

    /// <summary>
    /// Imports a user dictionary from a string.
    /// 1. Builds the user dictionary from the supplied words.
    /// 2. That dictionary replaces the user dictionary currently held in memory.
    /// </summary>
    /// <param name="pszDictBuffer">
    /// User dictionary string:
    /// 1. Word and part of speech are separated by "@@";
    /// 2. Words are separated by a half-width ';';
    /// 3. The part of speech may be omitted.
    /// For example "中科院@@nr;分词 v;系统@@adj;……;"
    /// or "中科院;分词;系统;……;".
    /// </param>
    /// <param name="length">Length of the string buffer.</param>
    /// <param name="codeType">Encoding.</param>
    /// <returns>Number of words successfully imported.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDict")]
    private static extern uint ICTCLAS_ImportUserDict(string pszDictBuffer, int length, ECodeType codeType);

    /// <summary>
    /// Saves the user dictionary.
    /// 1. Overwrites the user-related dictionaries in the /data/ folder.
    /// 2. The configuration file can specify whether the dictionary is used next time.
    /// </summary>
    /// <returns>Native status value; the wrapper treats non-zero as success.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SaveTheUsrDic")]
    private static extern int ICTCLAS_SaveTheUsrDic();

    /// <summary>
    /// Segments a paragraph of text.
    /// </summary>
    /// <param name="sParagraph">Text to process.</param>
    /// <param name="result">Receives the segmentation results.</param>
    /// <param name="eCT">Encoding of the text.</param>
    /// <param name="bPOSTagged">Whether to perform part-of-speech tagging.</param>
    /// <returns>Number of results written (used by the wrapper as the word count).</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcessAW")]
    private static extern int ICTCLAS_ParagraphProcessAW(string sParagraph, [Out, MarshalAs(UnmanagedType.LPArray)]result_t[] result, ECodeType eCT, bool bPOSTagged = false);

    /// <summary>
    /// Segments a text file.
    /// </summary>
    /// <param name="sSrcFilename">Name of the file to segment.</param>
    /// <param name="sDsnFilename">Name of the destination file.</param>
    /// <param name="eCt">Encoding.</param>
    /// <param name="bPOStagged">Whether to perform part-of-speech tagging.</param>
    /// <returns>Whether the call succeeded.</returns>
    [DllImport(DLLPATH, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_FileProcess")]
    private static extern bool ICTCLAS_FileProcess(string sSrcFilename, string sDsnFilename, ECodeType eCt, bool bPOStagged = false);
}
}
整个工程代码:ICTCLAS.NET.rar
ICTCLAS分词接口建议到官方下载,不过为了对应版本,也可以从如下地址下载:
本文基于署名 2.5 中国大陆许可协议发布,欢迎转载,演绎或用于商业目的,但是必须保留本文的署名小橋流水(包含链接)。如您有任何疑问或者授权方面的协商,请给我发邮件。