C# 新浪微博滚动抓取 WeiboGrab

应该先说,本来相对网页加载的程序段进行规范的,但是,当再次编写的时候发现,还是不能很好的掌握网页加载的具体规则,导致获取页面的代码还是很繁杂。其他部分改的差不多了,还有就是当微博中的字符含有{}等时,会提示字符串格式错误,这个也该需要改进的,,还没改进,程序还需要一个挂空线程的功能,保留现场,让程序可以继续爬取,而不是从头再爬。
各种类
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;
using HtmlAgilityPack;

namespace WeiBoGrab
{
    class WeiBoGrabClass
    {
    }
    
    public class GetPage
    {
        //加载初始页面
        public string GetLoginPage(WebBrowser browser)
        {
            while (browser.ReadyState != WebBrowserReadyState.Complete)
            {
                Application.DoEvents();
            }
            while (browser.Document.GetElementById("pl_login_form").InnerHtml == null)
            {
                Application.DoEvents();
            }
            return "加载登陆页面完成。";
        }
       //加载用户主页
        public string GetMainPage(WebBrowser browser)
        {
            while (browser.DocumentTitle != "我的首页 新浪微博-随时随地分享身边的新鲜事儿")
            {
                Application.DoEvents();
            }
             //确保加载完所需内容
            while (browser.Document.GetElementById("pl_rightmod_myinfo")!=null&&
                browser.Document.GetElementById("pl_rightmod_myinfo").Children.Count < 2)
            {
                Application.DoEvents();
            }
            
            return "加载个人主页完成。";
        }
       //加载用户关注对象的第一页
        public string GetFollowsPage(WebBrowser browser)
        {
            while (browser.DocumentTitle != "我关注的人 新浪微博-随时随地分享身边的新鲜事儿")
            {
                Application.DoEvents();
            }
            while (browser.Document.GetElementById("pl_relation_myfollow") == null)
            {
                Application.DoEvents();
            }
            while (browser.Document.GetElementById("pl_relation_myfollow").Children.Count < 3)
            {
                Application.DoEvents();
            }
            return "关注对象页面第一页加载完成。";
        }
        //加载用户关注对象的下一页
        public string GetFollowsNextPage(WebBrowser browser)
        {
            //将原页面的关注对象列表清空(关注对象列表为children[2].children[1])
            //加载新页面3=browser.Document.GetElementById("pl_relation_myfollow").Children[2].Children.Count
            //不明白,孩子个数显示明明是3,但是述操作却正确。。。 
            //browser.Document.GetElementById("pl_relation_myfollow").Children[2].Children.Count < 4
            //<!--  -->此类标签有时会被当做标签计数或提取,需要实际分析

            while (browser.Document.GetElementById("pl_relation_myfollow").Children.Count < 3||
                   browser.Document.GetElementById("pl_relation_myfollow").Children[2].Children.Count < 4)
            {
                Application.DoEvents();
            }
            //当上述条件满足后,再加载,便是新生成的内容
            return "关注对象下一页加载完成。";
        }
        //加载关注对象的主页的第一页
        public string GetFollowMainPage(WebBrowser browser)
        {
            while (browser.ReadyState != WebBrowserReadyState.Complete)
            {
                Application.DoEvents();
            }

            //当微博是杂志、新闻类时
            if (browser.Document.GetElementById("epfeedlist") != null)
            {
                while (browser.Document.GetElementById("feed_list") == null)
                {
                    Application.DoEvents();
                }
                return "关注对象主页第一页加载完成。";
            }
            //当微博是个人、媒体类时
            if (browser.Document.GetElementById("pl_content_hisFeed") == null)
            {
                while (browser.Document.GetElementById("profileFeed").InnerHtml == null)
                {
                    Application.DoEvents();
                }
            }
            while (browser.Document.GetElementById("pl_content_hisFeed").InnerHtml == null)
            {
                Application.DoEvents();
            }
            //找到feed
            HtmlElementCollection ps = browser.Document.GetElementById("pl_content_hisFeed").Children;
            int feed_postion = 0;
            //有的微博页面需要此步骤
            while (browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].InnerText == "正在加载,请稍候..." ||
                browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].InnerText == "正在加载中,请稍候...")
            {
                Application.DoEvents();
            }
            //pl_content_hisFeed加载不全
            while (browser.Document.GetElementById("pl_content_hisFeed").Children.Count < 2)
            {
                Application.DoEvents();
            }
            foreach (HtmlElement p in ps)
            {
                if (p.GetAttribute("node-type") != null && p.GetAttribute("node-type") == "feed_list")
                {
                    break;
                }
                else
                    feed_postion++;
            }
            //非第一页加载时,有此等待
            while (browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children[0].InnerText == "正在加载中,请稍候..."
                   || browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children[0].InnerText == "正在加载,请稍候...")
            {
                Application.DoEvents();
            }
            //微博数量及等待加载模块所在位置表示
            int hisFeed_count = browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children.Count - 1;
            //表示正在加载
            bool loading = true;
            //找出加载模块位置
            HtmlElement load = browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children[hisFeed_count];
            int i;
            for (i = 1; (i < 10) && (hisFeed_count - i >= 0); i++)
            {
                if (load.InnerText == "正在加载中,请稍候..." || load.InnerText == "正在加载,请稍候...")
                    break;
                load = browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children[hisFeed_count - i];
            }
            while (loading)
            {
                loading = false;
                load.ScrollIntoView(false);
                while (load.InnerText == "正在加载中,请稍候..." || load.InnerText == "正在加载,请稍候...")
                {
                    load.ScrollIntoView(false);
                    Application.DoEvents();
                    load = browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children[hisFeed_count - i];
                }
                //微博加载
                //限制次数,limit有待商榷,过小会使有的微博可能会加载失败
                int Limit = 100;
                int L = 0;
                while ((browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children.Count < hisFeed_count + 2)&&
                    (L < Limit ))
                {
                    L++;//防止无限加载的等待
                    Application.DoEvents();
                }
                //更新加载模块位置
                hisFeed_count = browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children.Count - 1;
                //更新加载模块
                load = browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children[hisFeed_count];
                for (int j = 1; (j < 10) && (hisFeed_count - j >= 0); j++)//假设无效的标签数不超过10个
                {
                    if (load.InnerText == "正在加载中,请稍候..." || load.InnerText == "正在加载,请稍候...")
                        break;
                    load = browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children[hisFeed_count - j];
                }
                if (load != null && (load.InnerText == "正在加载中,请稍候..." || load.InnerText == "正在加载,请稍候..."))
                {
                    loading = true;
                    load.ScrollIntoView(false);
                }
            }
            return "加载关注对象主页第一页面完成。";

        }
        //加载关注对象的的主页的下一页
        public string GetFollowMainNextPage(WebBrowser browser)
        {
            Application.DoEvents();
            while (browser.ReadyState != WebBrowserReadyState.Complete)
            {
                Application.DoEvents();
            }
            GetFollowMainPage(browser);
            //针对杂志、新闻类微博
            if (browser.Document.GetElementById("epfeedlist") == null)
                Application.DoEvents();
            return "加载关注对象后续页面完成。";
        }
    }
    //用户登陆类
    public class LoginSubmit
    {
        private string username;
        private string password;
        //初始化登陆对象
        public LoginSubmit(string username, string password)
        {
            this.username = username;
            this.password = password;
        }
        //点击登陆
        public void LoginClick(WebBrowser browser)
        {
            //登陆页面的登陆模块
            HtmlElement pl_login_form = browser.Document.GetElementById("pl_login_form");
            //登陆模块中的用户名_INPUT
            HtmlElement pl_login_form_username = pl_login_form.GetElementsByTagName("INPUT")[0];
            //让用户名输入框获取焦点(目的清空输入框)
            pl_login_form_username.InvokeMember("click");
            pl_login_form_username.SetAttribute("value",username);

            //登陆模块的密码_INPUT
            HtmlElement pl_login_form_password = pl_login_form.GetElementsByTagName("INPUT")[1];
            //让密码输入框获取焦点(目的清空输入框)
            pl_login_form_password.InvokeMember("click");
            pl_login_form_password.SetAttribute("value",password);

            //找到登陆按钮并点击
            HtmlElementCollection IsClick = pl_login_form.GetElementsByTagName("span");
            foreach (HtmlElement Click in IsClick)
            {
                if (Click.GetAttribute("node-type") != null && Click.GetAttribute("node-type") == "submitStates")
                {
                    Click.InvokeMember("click");
                    break;
                }
            }
        }
    }
    //将关注对象设为一类
    public class Follow
    {
        //获取关注对象(点击用户关注对象的超链接)
        public void GetFollows(WebBrowser browser)
        {
            //获取用户的信息模块
            HtmlElement pl_rightmod_myinfo = browser.Document.GetElementById("pl_rightmod_myinfo");
            //获取关注对象子模块
            HtmlElement my_info_follow = pl_rightmod_myinfo.GetElementsByTagName("strong")[0];
            if (my_info_follow.GetAttribute("node-type") == "follow")
            {
                //判断用户是否有关注对象
                if (my_info_follow.InnerText == "0")
                    return;
                my_info_follow.InvokeMember("click");
                GetPage getfollowpage = new GetPage();
                getfollowpage.GetFollowsPage(browser);
            }
        }
        //获取关注对象的url,并写到txt中
        public void GetFollowsUrl(WebBrowser browser, StreamWriter sw)
        {
            //是否还有下一页
            bool Next = true;
            int UrlCount = 0;
            while (Next)
            {
                //默认没有下一页
                Next = false;

                HtmlElement FollowLinks = browser.Document.GetElementById("pl_relation_myfollow");
                HtmlElementCollection Links = FollowLinks.GetElementsByTagName("div");

                foreach (HtmlElement Link in Links)
                {
                    if (Link.GetAttribute("action-type") == "ignore_list")
                    {
                        HtmlNode href = HtmlNode.CreateNode(Link.InnerHtml);

                        string url = href.Attributes["href"].Value;
                        string followname = href.FirstChild.Attributes["alt"].Value;

                        sw.WriteLine("No.{0}|{1}|{2}", ++UrlCount, followname, url);
                    }
                }
                HtmlElementCollection pages = FollowLinks.GetElementsByTagName("span");

                //判断是否有下一页
                foreach (HtmlElement page in pages)
                {
                    if (page.InnerText == "下一页")
                    {
                        Next = true;
                        page.InvokeMember("click");
                        
                        //Console.WriteLine("这个标签是:"+browser.Document.GetElementById("pl_relation_myfollow").Children[2].Children[2].OuterHtml);
                        browser.Document.GetElementById("pl_relation_myfollow").Children[2].Children[2].OuterHtml = null;
                        
                        //载入关注对象页面的下一页
                        GetPage GetNext = new GetPage();
                        GetNext.GetFollowsNextPage(browser);
                        break;
                    }
                }
            }
            sw.Close();
        }
    }
    //将微博设为一类
    public class WeiBo
    {
        private string FollowName;
        private string FollowUrl;

        public WeiBo(string FollowName, string FollowUrl)
        {
            this.FollowName = FollowName;
            this.FollowUrl = FollowUrl;
        }
        public void GetWeiBo(WebBrowser browser)
        {
            StreamWriter sw = File.CreateText("D:\\weibo\\" + FollowName + ".txt");
            bool Next = true;
            int WeiBoCount = 0;
            browser.Navigate(new Uri(@FollowUrl));
            GetPage GetNext = new GetPage();
            GetNext.GetFollowMainPage(browser);
            //默认还没登记此类微博
            string Kind = "N";

            HtmlElement epfeedlist = browser.Document.GetElementById("epfeedlist");
            HtmlElement pl_content_hisFeed = browser.Document.GetElementById("pl_content_hisFeed");
            if (pl_content_hisFeed != null)
            {
                //媒体类微博的pl_content_hisFeed.Children[1].Children[0].TagName = "dl"
                
                //个人微博的pl_content_hisFeed.Children[1].OuterHtml =<!-- /高级搜索 -->
                if (pl_content_hisFeed.Children[1].Children.Count != 0)
                        //媒体(小)微博
                        Kind = "M";

                    //个人微博                
                else
                     Kind = "P";
            }
            if (epfeedlist != null)
                //杂志,新闻等微博
                Kind = "J";
            while (Next)
            {
                Next = false;
                switch (Kind)
                {
                    case "P": 
                        {
                            //爬取各条微博
                          HtmlElementCollection divs = browser.Document.GetElementById("pl_content_hisFeed").GetElementsByTagName("div");
                          foreach (HtmlElement div in divs)
                           {
                            if (div.GetAttribute("node-type") == "feed_list_content")
                                sw.WriteLine("第{0}条|" + div.InnerText, ++WeiBoCount);
                           }
                            //判断是否还有下一页
                          HtmlElementCollection spans = browser.Document.GetElementById("pl_content_hisFeed").GetElementsByTagName("span");
                          foreach (HtmlElement span in spans)
                          {
                              if (span.InnerText == "下一页")
                              {
                                  span.InvokeMember("click");
                                  Next = true;
                                  GetNext.GetFollowMainNextPage(browser);
                                  break;
                              }
                          }
                        }break;
                    case "J":
                        {
                            //爬取各条微博
                            int count_li = browser.Document.GetElementById("feed_list").Children.Count;
                            for (int i = 0; i < count_li; i++)
                            {
                                sw.WriteLine("第{0}条|" + browser.Document.GetElementById("feed_list").Children[i].GetElementsByTagName("p")[0].InnerText, ++WeiBoCount);
                            }
                            //判断是否还有下一页
                            HtmlElementCollection ems = browser.Document.GetElementById("feed_list").NextSibling.GetElementsByTagName("em");
                            int end = ems.Count;
                            if (ems[end - 1].InnerText == "下一页")
                            {
                                ems[end - 1].InvokeMember("click");
                                browser.Document.GetElementById("feed_list").OuterHtml = null;
                                GetNext.GetFollowMainNextPage(browser);
                                Next = true;
                            }
                        }break;
                    case "M":
                        {
                            HtmlElementCollection ps = browser.Document.GetElementById("pl_content_hisFeed").GetElementsByTagName("p");
                            foreach (HtmlElement p in ps)
                            {
                                if (p.GetAttribute("node-type") == "feed_list_content")
                                    sw.WriteLine("第{0}条|" + p.InnerText, ++WeiBoCount);
                            }
                            //判断是否还有下一页
                            HtmlElementCollection spans = browser.Document.GetElementById("pl_content_hisFeed").GetElementsByTagName("span");
                            foreach (HtmlElement span in spans)
                            {
                                if (span.InnerText == "下一页")
                                {
                                    span.InvokeMember("click");
                                    Next = true;
                                    GetNext.GetFollowMainNextPage(browser);
                                    break;
                                }
                            }

                        }break;
                    default: return;//还没记录的微博
                }
            }
            sw.Close();
        }
    }
}
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;

namespace WeiBoGrab
{
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        private void button1_Click(object sender, EventArgs e)
        {
            string username = textBox1.Text.ToString();
            string password = textBox2.Text.ToString();
            string url = "http://weibo.com/";
            GetPage getpage = new GetPage();
            StreamWriter sw = File.CreateText("FollowUrl.txt");
            WebBrowser browser = webBrowser1;

            browser.Navigate(new Uri(@url));
            //加载登陆页面
            textBox3.Text += getpage.GetLoginPage(browser);
            //登陆操作
            LoginSubmit loginsubmit = new LoginSubmit(username, password);
            loginsubmit.LoginClick(browser);
            //加载个人主页
            textBox3.Text += getpage.GetMainPage(browser);
            //获取关注对象
            Follow follow = new Follow();
            follow.GetFollows(browser);                     
            follow.GetFollowsUrl(browser,sw);

            FileStream fs = new FileStream("FollowUrl.txt",FileMode.Open);
            StreamReader sr = new StreamReader(fs);
            string s;
            while ((s = sr.ReadLine()) != null)
            {
                string[] arry = s.Split('|');
                string name = arry[1];
                string user_url = arry[2];
                WeiBo feed = new WeiBo(name, user_url);
                feed.GetWeiBo(browser);
            }
            sr.Close();
        }
    }
}

 

posted @ 2013-05-25 10:09  idealing  阅读(3424)  评论(0编辑  收藏  举报