C#网页采集

  /// <summary>
        /// Extracts the distinct values captured by the named group <c>key</c>
        /// from every match of a regular expression in the given text.
        /// </summary>
        /// <param name="rex">
        /// Regular expression pattern, applied case-insensitively. It is expected to
        /// define a named group <c>key</c>; if it does not, each match contributes an
        /// empty string (which is then de-duplicated like any other value).
        /// </param>
        /// <param name="urlValue">Text to search.</param>
        /// <returns>
        /// The distinct <c>key</c> values in order of first occurrence; an empty
        /// array when the pattern does not match.
        /// </returns>
        private string[] rexID(string rex, string urlValue)
        {
            ArrayList keys = new ArrayList();
            Regex pattern = new Regex(rex, RegexOptions.IgnoreCase);

            foreach (Match match in pattern.Matches(urlValue))
            {
                // Read the group straight from the original match. Re-running the
                // pattern on the matched substring (as the previous version did, and
                // case-sensitively) loses lookaround/anchor context and drops keys
                // for matches that only succeeded thanks to IgnoreCase.
                string key = match.Groups["key"].Value;

                // De-duplicate on the value that is actually stored (the key),
                // not on the full match text.
                if (!keys.Contains(key))
                {
                    keys.Add(key);
                }
            }

            return (string[])keys.ToArray(typeof(string));
        }

 

posted @ 2014-01-11 11:49  秋千,为谁荡  阅读(289)  评论(0)  编辑  收藏  举报