网络爬虫(抓取)正则表达式 (多线程协作)

1.多线程调用界面后台代码


using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using HraWeb.Common;
using WebApp.Common;
using Contract.Domain;
using System.Collections;
using System.IO;
using System.Net;
using System.Runtime.Serialization.Json;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using Common;
using Elmah;
using Framework;
using ThreadTemplate;
using System.Diagnostics;

namespace HraWeb
{

    /// <summary>
    /// Code-behind of the company management page. Clicking <c>Button1</c> crawls the
    /// credit rating of every company that has no <c>CcrCreditScoreInfo</c> row yet,
    /// using a bounded set of <see cref="clsSubThread"/> workers, and persists each
    /// worker's results through <c>Dao</c>.
    /// </summary>
    public partial class CcrCompanyManage : JQEntityManage<Contract.Domain.CcrCompanyFundamental>
    {
        /// <summary>Maximum number of crawler workers running at the same time.</summary>
        private const int MaxConcurrentThreads = 100;

        /// <summary>Number of companies handed to each crawler worker (the "page size").</summary>
        private const int CompaniesPerThread = 1;

        /// <summary>Pause between two polls of the running workers when none has finished.</summary>
        private const int PollIntervalMilliseconds = 50;

        /// <summary>
        /// Serialises crawls: <see cref="dic"/> is static and therefore shared by every
        /// request, so two overlapping crawls would corrupt each other's pages.
        /// </summary>
        private static readonly object CrawlGate = new object();

        private Spring.Caching.ICache cache;

        /// <summary>Lazily resolves the "AspNetCache" object from the Spring context.</summary>
        private Spring.Caching.ICache SpringCache
        {
            get
            {
                if (cache == null)
                {
                    cache = (Spring.Caching.ICache)ctx.GetObject("AspNetCache");
                }

                return cache;
            }
            set { cache = value; }
        }

        /// <summary>
        /// Companies to crawl, grouped by 1-based page index; one page is processed by one
        /// worker. Rebuilt from scratch at the start of every crawl.
        /// </summary>
        public static Dictionary<int, IList<CcrCompanyFundamental>> dic = new Dictionary<int, IList<CcrCompanyFundamental>>();

        /// <summary>
        /// Copies the companies belonging to page <paramref name="pageIndex"/> into
        /// <see cref="dic"/>. The page is clamped to the end of the list, so the last
        /// (possibly partial) page is handled by the same code as the full ones.
        /// </summary>
        /// <param name="pageIndex">1-based page index.</param>
        /// <param name="pageSize">Number of companies per page.</param>
        /// <param name="funmentals">Full list of companies being paged.</param>
        private void ConstractionPrePageIndexData(int pageIndex, int pageSize, IList<CcrCompanyFundamental> funmentals)
        {
            // Positions are 1-based here; the list itself is indexed with position - 1.
            int start = (pageIndex - 1) * pageSize + 1;
            int end = Math.Min(start + pageSize - 1, funmentals.Count);

            for (int position = start; position <= end; position++)
            {
                IList<CcrCompanyFundamental> page;
                if (!dic.TryGetValue(pageIndex, out page))
                {
                    page = new List<CcrCompanyFundamental>();
                    dic.Add(pageIndex, page);
                }

                page.Add(funmentals[position - 1]);
            }
        }

        /// <summary>
        /// Crawls the rating of every company without a stored credit score and hands each
        /// finished worker's result list to <see cref="SaveOrUpdate"/> on a new thread.
        /// Blocks until all workers have finished.
        /// </summary>
        /// <returns>An <see cref="ArrayList"/> that is always empty; the crawled data is persisted, not returned.</returns>
        private ArrayList GetRatingInfo()
        {
            ArrayList ratingClassPaimingStates = new ArrayList();

            lock (CrawlGate)
            {
                // Ids of companies that already have a score; a set keeps the filter below linear.
                IList<CcrCreditScoreInfo> temps = Holworth.Utility.HraUtility.ListToT<CcrCreditScoreInfo>(Dao.FindList(new QueryInfo("CcrCreditScoreInfo")));
                HashSet<int?> fids = new HashSet<int?>(temps.Select(x => x.FundamentalId));

                IList<CcrCompanyFundamental> funmentals = Holworth.Utility.HraUtility.ListToT<CcrCompanyFundamental>(Dao.FindList(new QueryInfo("CcrCompanyFundamental")));
                funmentals = funmentals.Where(f => !fids.Contains(int.Parse(f.Id))).ToList();

                // The dictionary is static: without this, a second crawl would re-process the
                // previous crawl's pages and append duplicates to them.
                dic.Clear();

                int totalCount = funmentals.Count;
                int totalPage = (totalCount - 1) / CompaniesPerThread + 1;
                for (int pageIndex = 1; pageIndex <= totalPage; pageIndex++)
                {
                    ConstractionPrePageIndexData(pageIndex, CompaniesPerThread, funmentals);
                }

                int totalThreadNum = dic.Count;

                // Workers currently running, keyed by worker id.
                var runingHt = new Dictionary<int, clsSubThread>();
                // Workers waiting for a free slot, in creation order.
                var unRunHt = new Queue<KeyValuePair<int, clsSubThread>>();

                // Start the first MaxConcurrentThreads workers, queue the rest.
                for (int i = 1; i <= totalThreadNum; i++)
                {
                    clsSubThread th = new clsSubThread(i, dic[i]);

                    if (i <= MaxConcurrentThreads)
                    {
                        runingHt.Add(i, th);
                        th.Start();
                    }
                    else
                    {
                        unRunHt.Enqueue(new KeyValuePair<int, clsSubThread>(i, th));
                    }
                }

                // Loop until every worker has run and finished.
                while (runingHt.Count > 0 || unRunHt.Count > 0)
                {
                    // Snapshot the finished ids first: the dictionary is modified below.
                    List<int> stepFinishList = runingHt
                        .Where(entry => entry.Value.IsStopped)
                        .Select(entry => entry.Key)
                        .ToList();

                    foreach (int tid in stepFinishList)
                    {
                        // Persist the finished worker's results on a separate thread.
                        Thread t1 = new Thread(new ParameterizedThreadStart(SaveOrUpdate));
                        t1.Start(runingHt[tid].ReturnList);
                        runingHt.Remove(tid);

                        // A slot is free: promote the next waiting worker, if any.
                        if (unRunHt.Count > 0)
                        {
                            KeyValuePair<int, clsSubThread> next = unRunHt.Dequeue();
                            next.Value.Start();
                            runingHt.Add(next.Key, next.Value);
                        }
                    }

                    if (stepFinishList.Count == 0)
                    {
                        // Nothing finished this round; yield instead of spinning a core.
                        Thread.Sleep(PollIntervalMilliseconds);
                    }
                }
            }

            return ratingClassPaimingStates;
        }

        /// <summary>
        /// Callback run after a worker has finished: saves the list it produced.
        /// </summary>
        /// <param name="o">The worker's result list, passed as <see cref="object"/> because this is a <see cref="ParameterizedThreadStart"/> target.</param>
        private void SaveOrUpdate(object o)
        {
            IList list = (IList)o;
            if (list == null || list.Count == 0)
            {
                // Nothing was crawled for this page; there is nothing to persist.
                return;
            }

            Dao.SaveOrUpdateAll(list);
        }

        /// <summary>Delegates to the base page's load handling.</summary>
        protected override void Page_Load(object sender, EventArgs e)
        {
            base.Page_Load(sender, e);
        }

        /// <summary>
        /// Click handler: crawls the rating pages and stores the results through the
        /// cooperating worker threads.
        /// </summary>
        protected void Button1_Click(object sender, EventArgs e)
        {
            GetRatingInfo();
        }
    }
}

2.封装的线程类


using System.Collections.Generic;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using Contract.Domain;
using System.Web;
namespace ThreadTemplate
{
    using System;
    using System.Threading;
    using System.IO;
    /// <summary>
    /// Worker that runs on its own thread, downloads the credit-search page for each
    /// company it was given, and collects the ratings extracted with regular expressions.
    /// </summary>
    /// <remarks>
    /// <see cref="IsStopped"/> becomes <c>true</c> only after the worker has finished
    /// filling <see cref="ReturnList"/>, so a caller that polls <see cref="IsStopped"/>
    /// can read the list safely afterwards.
    /// </remarks>
    public class clsSubThread : IDisposable
    {
        private const string SearchUrlPrefix = "http://bgcheck.cn/MemberCenter/FirmCredit/Search.html?Keywords=";

        // The expressions are shared by all workers; Regex instances are safe for concurrent matching.
        private static readonly Regex RatingClassRegex = new Regex(@"(?<=\[信用等级:([\s\S]*?)<a(.*)?[^>]*?>)([\s\S]*?)(?=</a>)");
        private static readonly Regex RatingSequenceRegex = new Regex(@"(?<=信用排名:([\s\S]*?)<span(.*)?[^>]*?>)([\s\S]*?)(?=</span>)");
        private static readonly Regex RatingStateRegex = new Regex(@"(?<=信用状况:([\s\S]*?)<span(.*)?[^>]*?>)([\s\S]*?)(?=</span>)");

        private Thread thdSubThread = null;

        // Signalled while the worker may proceed; reset while it is suspended.
        private readonly ManualResetEvent resumeSignal = new ManualResetEvent(true);

        // Written by the worker thread and read by the coordinating thread, hence volatile.
        private volatile bool blnIsStopped;
        private volatile bool blnSuspended;
        private volatile bool blnStarted;

        private IList<CcrCompanyFundamental> clist;
        private int threadId;
        private IList<CcrCreditScoreInfo> paiming;

        /// <summary>Identifier assigned by the creator of this worker.</summary>
        public int ThreadId { get { return threadId; } set { threadId = value; } }

        /// <summary><c>true</c> before the worker is started and again once it has finished.</summary>
        public bool IsStopped
        {
            get { return blnIsStopped; }
        }

        /// <summary><c>true</c> while the worker is paused by <see cref="Suspend"/>.</summary>
        public bool IsSuspended
        {
            get { return blnSuspended; }
        }

        /// <summary>Ratings collected by the most recent run.</summary>
        public IList<CcrCreditScoreInfo> ReturnList
        {
            get { return paiming; }
        }

        /// <summary>Replaces the companies processed by the next run.</summary>
        public IList<CcrCompanyFundamental> CList
        {
            set { clist = value; }
        }

        /// <summary>Creates a worker that is not yet running.</summary>
        /// <param name="key">Identifier exposed through <see cref="ThreadId"/>.</param>
        /// <param name="pclist">Companies whose rating this worker will fetch.</param>
        public clsSubThread(int key, IList<CcrCompanyFundamental> pclist)
        {
            threadId = key;
            paiming = new List<CcrCreditScoreInfo>();

            blnIsStopped = true;
            blnSuspended = false;
            blnStarted = false;

            clist = pclist;
        }

        /// <summary>
        /// Starts the worker thread. Does nothing if the worker is already running.
        /// </summary>
        public void Start()
        {
            if (!blnStarted)
            {
                // Make sure a suspension left over from a previous run cannot block this one.
                blnSuspended = false;
                resumeSignal.Set();

                thdSubThread = new Thread(new ThreadStart(SubThread));
                blnIsStopped = false;
                blnStarted = true;
                thdSubThread.Start();
            }
        }

        /// <summary>
        /// Thread entry point: fetches the search page of every company and extracts the
        /// rating class, rating sequence and rating state from it.
        /// </summary>
        private void SubThread()
        {
            List<CcrCreditScoreInfo> results = new List<CcrCreditScoreInfo>();
            paiming = results;

            try
            {
                IList<CcrCompanyFundamental> companies = clist;
                if (companies == null)
                {
                    return;
                }

                using (WebClient wc = new WebClient())
                {
                    for (int i = 0; i < companies.Count; i++)
                    {
                        // Blocks here while the worker is suspended.
                        resumeSignal.WaitOne();

                        CcrCompanyFundamental company = companies[i];
                        string keyword = company.CompanyName ?? string.Empty;

                        string content;
                        try
                        {
                            content = Download(wc, keyword);
                        }
                        catch (WebException ex)
                        {
                            // No result is recorded for a company whose page could not be fetched.
                            System.Diagnostics.Trace.TraceWarning("clsSubThread {0}: failed to fetch rating for '{1}': {2}", threadId, keyword, ex.Message);
                            continue;
                        }
                        catch (IOException ex)
                        {
                            System.Diagnostics.Trace.TraceWarning("clsSubThread {0}: failed to read rating for '{1}': {2}", threadId, keyword, ex.Message);
                            continue;
                        }

                        CcrCreditScoreInfo c = new CcrCreditScoreInfo();
                        c.FundamentalId = int.Parse(company.Id);
                        c.CompanyName = company.CompanyName;
                        c.RatingClass = FirstMatchOrEmpty(RatingClassRegex, content);
                        c.RatingSequence = FirstMatchOrEmpty(RatingSequenceRegex, content);
                        c.RatingState = FirstMatchOrEmpty(RatingStateRegex, content);
                        results.Add(c);
                    }
                }
            }
            finally
            {
                // Always report completion, even on failure, so a poller of IsStopped cannot
                // wait forever. The flags are set directly: calling Stop() here would make
                // the thread Join() itself and never exit.
                blnSuspended = false;
                blnStarted = false;
                blnIsStopped = true;
            }
        }

        /// <summary>Downloads the search result page for <paramref name="keyword"/> as UTF-8 text.</summary>
        private static string Download(WebClient client, string keyword)
        {
            // The keyword is escaped so that characters such as '&' or '#' stay inside the query value.
            using (Stream stream = client.OpenRead(SearchUrlPrefix + Uri.EscapeDataString(keyword)))
            using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>Returns the text of the first match of <paramref name="pattern"/>, or an empty string when there is none.</summary>
        private static string FirstMatchOrEmpty(Regex pattern, string content)
        {
            Match match = pattern.Match(content);
            return match.Success ? match.Value : string.Empty;
        }

        /// <summary>
        /// Pauses the worker before it processes its next company. Has no effect unless the
        /// worker is running and not already suspended.
        /// </summary>
        public void Suspend()
        {
            if (blnStarted && !blnSuspended)
            {
                blnSuspended = true;
                resumeSignal.Reset();
            }
        }

        /// <summary>
        /// Lets a suspended worker continue. Has no effect unless the worker is suspended.
        /// </summary>
        public void Resume()
        {
            if (blnStarted && blnSuspended)
            {
                blnSuspended = false;
                resumeSignal.Set();
            }
        }

        /// <summary>
        /// Waits for a running worker to finish (resuming it first if it is suspended) and
        /// marks it as stopped. Safe to call from the worker thread itself, in which case
        /// it does not wait.
        /// </summary>
        public void Stop()
        {
            if (blnStarted)
            {
                if (blnSuspended)
                {
                    Resume();
                }

                Thread worker = thdSubThread;
                if (worker != null && !ReferenceEquals(worker, Thread.CurrentThread))
                {
                    // Joining the current thread would block it forever.
                    worker.Join();
                }

                blnStarted = false;
                blnIsStopped = true;
            }
        }

        #region IDisposable Members
        /// <summary>
        /// Waits for the worker to finish and releases the wait handle it uses.
        /// </summary>
        public void Dispose()
        {
            Stop(); // The worker must be finished before its wait handle is closed.
            resumeSignal.Close();
            GC.SuppressFinalize(this);
        }

        #endregion
    }
}

 

posted on 2016-01-08 23:43  听哥哥的话  阅读(642)  评论(0)  编辑  收藏  举报

导航