Daily Report 2012.11.06 刘宇翔

今天对李忠修改过的match函数进行测试,修正bug,并进行优化。

将中文分词方法加入到算法中,提高了算法的精确度。

但中文分词方法加入到算法后,出现了一些新问题,已对这些新出现的问题进行了修正和优化。

测试过程中运用了900条字符串的样例。

更新后代码如下:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;

namespace match0
{
    class Program
    {
        /// <summary>
        /// Scores how well <paramref name="word"/> matches <paramref name="keyword"/>.
        /// The word is split into terms (Chinese word segmentation first, falling back
        /// to splitting on spaces when segmentation yields nothing); each term is scored
        /// against the keyword with wordmatch and the highest score is returned.
        /// </summary>
        /// <param name="word">Candidate text to score.</param>
        /// <param name="keyword">Search keyword the candidate is compared with.</param>
        /// <returns>
        /// -1 when either input is null or empty; otherwise the best per-term score,
        /// which is 0 when nothing matches (or when the word contains no terms).
        /// </returns>
        static public int match(string word, string keyword)
        {
            const int InvalidInput = -1;

            // Validate before segmenting so null/empty input never reaches the segmenter.
            if (string.IsNullOrEmpty(word) || string.IsNullOrEmpty(keyword))
            {
                return InvalidInput;
            }

            List<string> wordlist = ChineseWordSegmentation.word_segmentation(word);
            if (wordlist == null)
            {
                // NOTE(review): assumes the segmenter may return null; confirm against its implementation.
                wordlist = new List<string>();
            }

            if (wordlist.Count == 0)
            {
                // Segmentation produced nothing: fall back to space-separated terms.
                wordlist.AddRange(word.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries));
            }

            // Best score over all terms. wordmatch never returns a negative value,
            // so starting from 0 also covers the single-term case.
            int best = 0;
            foreach (string term in wordlist)
            {
                int degree = wordmatch(term, keyword);
                if (degree > best)
                {
                    best = degree;
                }
            }

            return best;
        }

        /// <summary>
        /// Scores a single term against a whole keyword. The keyword is split into
        /// terms (Chinese word segmentation first, falling back to splitting on spaces)
        /// and the term is scored against each of them with wkmatch.
        /// </summary>
        /// <param name="w">A single term taken from the candidate text.</param>
        /// <param name="keyword">The full search keyword.</param>
        /// <returns>
        /// The highest wkmatch score over all keyword terms, or 0 when either input is
        /// null/empty or the keyword contains no terms (for example only spaces).
        /// </returns>
        static public int wordmatch(string w, string keyword)
        {
            if (string.IsNullOrEmpty(w) || string.IsNullOrEmpty(keyword))
            {
                return 0;
            }

            List<string> keywordlist = ChineseWordSegmentation.word_segmentation(keyword);
            if (keywordlist == null)
            {
                // NOTE(review): assumes the segmenter may return null; confirm against its implementation.
                keywordlist = new List<string>();
            }

            if (keywordlist.Count == 0)
            {
                // Segmentation produced nothing: fall back to space-separated terms.
                keywordlist.AddRange(keyword.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries));
            }

            // Starting from 0 makes an empty term list yield "no match" instead of
            // indexing into an empty score list; wkmatch never returns a negative value.
            int best = 0;
            foreach (string term in keywordlist)
            {
                int degree = wkmatch(w, term);
                if (degree > best)
                {
                    best = degree;
                }
            }

            return best;
        }

        /// <summary>
        /// Scores one candidate term against one keyword term, case-insensitively.
        /// The score is a fuzziness level (4 = equal, 3 = one contains the other,
        /// 2 = long shared run, 1 = at least one shared character, 0 = nothing shared)
        /// multiplied by a length: the full length for level 4, otherwise the length
        /// of the shorter term.
        /// </summary>
        /// <param name="w">A single term from the candidate text.</param>
        /// <param name="k">A single term from the keyword.</param>
        /// <returns>The weighted score; 0 when either term is null or empty.</returns>
        static public int wkmatch(string w, string k)
        {
            // Empty terms cannot match anything, and the helper checks below
            // are not meant to be called with an empty first argument.
            if (string.IsNullOrEmpty(w) || string.IsNullOrEmpty(k))
            {
                return 0;
            }

            // Invariant lower-casing keeps scores independent of the machine's culture.
            w = w.ToLowerInvariant();
            k = k.ToLowerInvariant();

            // Level 4: identical terms.
            if (w == k)
            {
                return 4 * w.Length;
            }

            // The helpers take (shorter, longer); on equal lengths k is treated as the shorter one.
            bool wIsShorter = w.Length < k.Length;
            string shorter = wIsShorter ? w : k;
            string longer = wIsShorter ? k : w;
            int minLength = shorter.Length;

            // Level 3: the shorter term occurs inside the longer one.
            if (m3(shorter, longer) == 1)
            {
                return 3 * minLength;
            }

            // Level 2: promoted to level 3 when the edit distance is small relative to the length.
            // NOTE(review): integer division means this promotion can only fire for minLength >= 8
            // (the terms differ, so the distance is at least 1) - confirm that is intended.
            if (m2(shorter, longer) == 1)
            {
                int level = editDistance(w, k) < minLength / 4 ? 3 : 2;
                return level * minLength;
            }

            // Level 1: promoted to level 2 by the same edit-distance idea with a stricter bound.
            // NOTE(review): with integer division this promotion needs minLength >= 20.
            if (m1(shorter, longer) == 1)
            {
                int level = editDistance(w, k) < minLength / 10 ? 2 : 1;
                return level * minLength;
            }

            // Level 0: no character in common.
            return 0;
        }

        /// <summary>
        /// Fuzziness level 3 check: reports whether <paramref name="x"/> occurs as a
        /// contiguous substring of <paramref name="y"/> (exact character comparison).
        /// </summary>
        /// <param name="x">The text to look for.</param>
        /// <param name="y">The text to search in.</param>
        /// <returns>1 when x is non-empty and found inside y; otherwise 0.</returns>
        static public int m3(string x, string y)
        {
            int needleLength = x.Length;
            int haystackLength = y.Length;

            // An empty needle counts as "not found", and a needle longer than the
            // haystack can never fit.
            if (needleLength == 0 || needleLength > haystackLength)
            {
                return 0;
            }

            // Ordinal search compares char by char, like a manual scan would.
            return y.IndexOf(x, StringComparison.Ordinal) >= 0 ? 1 : 0;
        }

        /// <summary>
        /// Fuzziness level 2 check: reports whether some contiguous run of
        /// <paramref name="x"/>, starting in its first half (index 0 .. x.Length / 2),
        /// also appears in <paramref name="y"/> and is longer than half of x.
        /// </summary>
        /// <param name="x">The shorter text.</param>
        /// <param name="y">The longer text.</param>
        /// <returns>
        /// 1 when the longest such shared run is longer than x.Length / 2; otherwise 0
        /// (including when either argument is null or empty).
        /// </returns>
        static public int m2(string x, string y)
        {
            if (string.IsNullOrEmpty(x) || string.IsNullOrEmpty(y))
            {
                return 0;
            }

            int m = x.Length;
            int n = y.Length;
            int threshold = m / 2;

            // A run starting after the midpoint cannot be longer than half of x,
            // so only start positions 0 .. m / 2 are worth trying.
            for (int start = 0; start <= threshold; start++)
            {
                for (int offset = 0; offset < n; offset++)
                {
                    // The run length is counted afresh for every (start, offset) pair,
                    // so matches found at different positions are never added together.
                    int run = 0;
                    while (start + run < m && offset + run < n && x[start + run] == y[offset + run])
                    {
                        run++;
                    }

                    if (run > threshold)
                    {
                        return 1;
                    }
                }
            }

            return 0;
        }

        /// <summary>
        /// Fuzziness level 1 check: reports whether <paramref name="x"/> and
        /// <paramref name="y"/> have at least one character in common.
        /// </summary>
        /// <param name="x">First text.</param>
        /// <param name="y">Second text.</param>
        /// <returns>1 when any character of x also occurs in y; otherwise 0.</returns>
        static public int m1(string x, string y)
        {
            int firstLength = x.Length;
            int secondLength = y.Length;

            // Nothing can be shared when either side has no characters.
            if (firstLength == 0 || secondLength == 0)
            {
                return 0;
            }

            foreach (char candidate in x)
            {
                // The first shared character settles the answer.
                if (y.IndexOf(candidate) >= 0)
                {
                    return 1;
                }
            }

            return 0;
        }

        /// <summary>
        /// Computes the edit (Levenshtein) distance between two strings: the minimum
        /// number of single-character insertions, deletions and substitutions, each
        /// costing 1, needed to turn <paramref name="x"/> into <paramref name="y"/>.
        /// Used to add error tolerance to the fuzzy matching.
        /// </summary>
        /// <param name="x">Source string.</param>
        /// <param name="y">Target string.</param>
        /// <returns>The edit distance between x and y.</returns>
        static public int editDistance(string x, string y)
        {
            // Cost of each elementary edit operation.
            const int DeleteCost = 1;
            const int InsertCost = 1;
            const int SubstituteCost = 1;

            int sourceLength = x.Length;
            int targetLength = y.Length;

            // Only two rows of the dynamic-programming table are alive at any time:
            // the row for the previous source prefix and the row being filled in.
            int[] previousRow = new int[targetLength + 1];
            int[] currentRow = new int[targetLength + 1];

            // Row 0: building the first `col` characters of y from nothing.
            for (int col = 1; col <= targetLength; col++)
            {
                previousRow[col] = previousRow[col - 1] + InsertCost;
            }

            for (int row = 1; row <= sourceLength; row++)
            {
                // Column 0: erasing the first `row` characters of x.
                currentRow[0] = previousRow[0] + DeleteCost;

                for (int col = 1; col <= targetLength; col++)
                {
                    int viaDelete = previousRow[col] + DeleteCost;
                    int viaInsert = currentRow[col - 1] + InsertCost;
                    int viaSubstitute = previousRow[col - 1]
                        + (x[row - 1] == y[col - 1] ? 0 : SubstituteCost);

                    // Keep the cheapest way of reaching this cell.
                    currentRow[col] = Math.Min(viaSubstitute, Math.Min(viaDelete, viaInsert));
                }

                // The freshly filled row becomes the reference for the next one.
                int[] swap = previousRow;
                previousRow = currentRow;
                currentRow = swap;
            }

            return previousRow[targetLength];
        }

        /// <summary>
        /// Converts half-width ASCII characters to their full-width forms:
        /// U+0021..U+007E are shifted into the full-width block (U+FF01..U+FF5E)
        /// and the space U+0020 becomes the ideographic space U+3000.
        /// Every other character (control characters, Latin-1 letters, CJK text, ...)
        /// is returned unchanged.
        /// </summary>
        /// <param name="s">Text to convert.</param>
        /// <returns>The converted text; null or empty input is returned as is.</returns>
        static string half_to_whole(string s)
        {
            if (string.IsNullOrEmpty(s))
            {
                return s;
            }

            // Distance between an ASCII character and its full-width counterpart.
            const int FullWidthOffset = 0xFEE0;
            const char IdeographicSpace = '\u3000';

            char[] chars = s.ToCharArray();
            for (int i = 0; i < chars.Length; i++)
            {
                char c = chars[i];
                if (c == ' ')
                {
                    // The space has no slot in the full-width block; its wide form is U+3000.
                    chars[i] = IdeographicSpace;
                }
                else if (c >= '!' && c <= '~')
                {
                    chars[i] = (char)(c + FullWidthOffset);
                }
            }

            return new string(chars);
        }

        // One scored line of the input file; Main creates one instance per non-empty line.
        public class eachline
        {
            // Text of the line exactly as read from the file.
            public string line;
            // Score returned by match(line, keyword).
            public int matchpoint;
            // Zero-based position of this line among the non-empty lines read.
            public int num;
        }

        /// <summary>
        /// Test driver: reads a keyword from standard input, scores every non-empty
        /// line of "test.txt" against it with match, and prints the keyword's segments
        /// followed by the best-scoring lines (at most 20), each with its score and
        /// its own segmentation.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            const int MaxResultsShown = 20;

            // ReadLine returns null at end of input; treat that as an empty keyword.
            string keyword = Console.ReadLine() ?? "";

            List<eachline> LineList = new List<eachline>();

            // `using` closes the file even when scoring a line throws.
            using (StreamReader objReader = new StreamReader("test.txt", System.Text.Encoding.Default))
            {
                string sLine;
                while ((sLine = objReader.ReadLine()) != null)
                {
                    if (sLine.Length == 0)
                    {
                        continue;
                    }

                    eachline row = new eachline();
                    row.line = sLine;
                    row.matchpoint = match(sLine, keyword); // scored once per line
                    row.num = LineList.Count;
                    LineList.Add(row);
                }
            }

            // Highest score first. OrderByDescending is a stable sort, so lines with
            // equal scores keep their file order.
            List<eachline> ranked = LineList.OrderByDescending(candidate => candidate.matchpoint).ToList();

            List<string> keywordlist = ChineseWordSegmentation.word_segmentation(keyword);
            foreach (string segment in keywordlist)
            {
                Console.WriteLine(segment);
            }

            // Never index past the end when the file has fewer lines than the display limit.
            int shown = Math.Min(MaxResultsShown, ranked.Count);
            for (int i = 0; i < shown; i++)
            {
                Console.WriteLine(ranked[i].line);
                Console.WriteLine(ranked[i].matchpoint);

                List<string> wordlist = ChineseWordSegmentation.word_segmentation(ranked[i].line);
                foreach (string segment in wordlist)
                {
                    Console.Write(segment + ' ');
                }
                Console.WriteLine(' ');
            }
        }
    }
}

但目前算法的精确度在语义层面仍有提升空间。

之后需要优化多关键词语义分析计算匹配程度,并测试修改过的代码,修正错误。

posted @ 2012-11-06 23:55  DOOM_buaascse  阅读(232)  评论(0)  编辑  收藏  举报