采集新闻
面向对象的方式做采集程序
采集新闻
方便扩展
存储到xml
步骤:
1、找对象 文章视为对象 每一个网站视为对象
2、为了方便扩展做类似于计算器的操作
把采集的网站视为对象,所有的网站都能够采集 和保存成xml
所以抽象出父类WebSite 抽象类
实现具体的子类cnbeta sina等
3、WebSite 抽象类 { name(网站名字,只读);Path(xml 保存路径);Url(采集的 url);抽象方法:Load(采集新闻)、Save(把新闻保存到 xml 中)}
4、cnbeta 继承WebSite{ }
donews
5、窗体加载时候根据反射读取每个继承自WebSite的子类的名字,添加到下拉框中
6、点采集按钮的时候,根据下拉框中的内容创建具体的子类,执行采集方法
7、点保存按钮的时候 把采集到的新闻集合,存储在xml中
cnBate
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Xml;

namespace 采集程序3
{
    /// <summary>
    /// <see cref="WebSite"/> implementation that downloads the page at
    /// <c>Url</c>, extracts news entries with a regular expression and
    /// stores them in an XML file at <c>Path</c>.
    /// </summary>
    class cnBate : WebSite
    {
        /// <summary>
        /// Matches one news entry and captures the named groups
        /// <c>title</c>, <c>author</c>, <c>time</c> and <c>content</c>.
        /// Singleline lets <c>.</c> span line breaks inside an entry.
        /// </summary>
        private static readonly Regex ArticleRegex = new Regex(
            @"<div\s+class=""newslist"">\s+<dl>.+?<strong>(?<title>.+?)</strong></a>.+?<span>(?<author>.+?)发布于\s+(?<time>\d{4}\-\d{2}\-\d{2}\s+?\d{2}:\d{2}:\d{2}).+?</a>.+?<span>(?<content>.+?)</span></dd>",
            RegexOptions.Singleline);

        /// <summary>Matches a single markup tag so it can be removed from the summary text.</summary>
        private static readonly Regex TagRegex = new Regex("<.+?>");

        /// <summary>
        /// Articles collected so far. <see cref="Load"/> appends to this list
        /// and never clears it; <see cref="Save"/> writes all of it.
        /// </summary>
        private readonly List<Article> articles = new List<Article>();

        /// <summary>Display name of this site.</summary>
        public override string name
        {
            get { return "cnBate"; }
        }

        /// <summary>
        /// Downloads the page at <c>Url</c>, decodes it as gb2312 and appends
        /// every matched entry to the internal article list.
        /// </summary>
        /// <returns>
        /// The internal article list (the same instance on every call), which
        /// therefore also contains results of earlier calls on this object.
        /// </returns>
        public override List<Article> Load()
        {
            string html;

            // WebClient is disposable; release it together with the stream and reader.
            using (WebClient client = new WebClient())
            using (Stream stream = client.OpenRead(base.Url))
            using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding("gb2312")))
            {
                // ReadToEnd consumes the whole response, so a single read is sufficient.
                html = reader.ReadToEnd();
            }

            // Every Match in a MatchCollection is a successful match, so no Success check is needed.
            foreach (Match match in ArticleRegex.Matches(html))
            {
                Article article = new Article();
                article.Title = match.Groups["title"].Value;
                article.Author = match.Groups["author"].Value;
                // Strip markup tags from the captured summary.
                article.Content = TagRegex.Replace(match.Groups["content"].Value, "");
                article.Time = DateTime.Parse(match.Groups["time"].Value);
                articles.Add(article);
            }

            return articles;
        }

        /// <summary>
        /// Writes the collected articles to <c>Path</c>: creates a new XML
        /// file when none exists, otherwise appends to the existing one.
        /// </summary>
        public override void Save()
        {
            if (File.Exists(base.Path))
            {
                AddXml();
            }
            else
            {
                CreateXml();
            }
        }

        /// <summary>
        /// Builds a new document with a <c>News</c> root containing one
        /// <c>New</c> element per collected article and saves it to <c>Path</c>.
        /// </summary>
        public void CreateXml()
        {
            XmlDocument doc = new XmlDocument();
            doc.AppendChild(doc.CreateXmlDeclaration("1.0", "utf-8", null));

            XmlElement root = doc.CreateElement("News");
            doc.AppendChild(root);

            AppendArticles(doc, root);
            doc.Save(base.Path);
        }

        /// <summary>
        /// Loads the existing document at <c>Path</c>, appends one <c>New</c>
        /// element per collected article to its root element and saves it back.
        /// </summary>
        public void AddXml()
        {
            XmlDocument doc = new XmlDocument();
            doc.Load(base.Path);

            AppendArticles(doc, doc.DocumentElement);
            doc.Save(base.Path);
        }

        /// <summary>
        /// Appends a <c>New</c> element with <c>Title</c>, <c>Author</c>,
        /// <c>Content</c> and <c>Time</c> children for every collected article.
        /// </summary>
        /// <param name="doc">Document used to create the elements.</param>
        /// <param name="parent">Element that receives the <c>New</c> children.</param>
        private void AppendArticles(XmlDocument doc, XmlElement parent)
        {
            foreach (Article item in articles)
            {
                XmlElement child = doc.CreateElement("New");
                parent.AppendChild(child);

                CreateItems(doc, child, item.Title, "Title");
                CreateItems(doc, child, item.Author, "Author");
                CreateItems(doc, child, item.Content, "Content");
                // NOTE(review): parameterless ToString() formats with the current
                // culture, so the stored text varies by machine settings.
                CreateItems(doc, child, item.Time.ToString(), "Time");
            }
        }

        /// <summary>
        /// Creates an element named <paramref name="str"/> whose text is
        /// <paramref name="item"/> and appends it to <paramref name="child"/>.
        /// </summary>
        private static void CreateItems(XmlDocument doc, XmlElement child, string item, string str)
        {
            XmlElement element = doc.CreateElement(str);
            element.InnerText = item;
            child.AppendChild(element);
        }
    }
}
WebSite
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace 采集程序3
{
    /// <summary>
    /// Base type for a site that can be scraped for articles and whose
    /// articles can be saved.
    /// </summary>
    public abstract class WebSite
    {
        /// <summary>Read-only name of the site, supplied by each subclass.</summary>
        public abstract string name { get; }

        /// <summary>URL to collect from.</summary>
        public string Url { get; set; }

        /// <summary>Path of the file the articles are saved to.</summary>
        public string Path { get; set; }

        /// <summary>Collects articles from the site.</summary>
        /// <returns>The collected articles.</returns>
        public abstract List<Article> Load();

        /// <summary>Saves the collected articles.</summary>
        public abstract void Save();
    }
}
Article
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace 采集程序3
{
    /// <summary>
    /// Plain data holder for one news article.
    /// </summary>
    public class Article
    {
        /// <summary>Headline of the article.</summary>
        public string Title { get; set; }

        /// <summary>Author of the article.</summary>
        public string Author { get; set; }

        /// <summary>Text content of the article.</summary>
        public string Content { get; set; }

        /// <summary>Timestamp associated with the article.</summary>
        public DateTime Time { get; set; }
    }
}
Fectory
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace 采集程序3
{
    /// <summary>
    /// Simple factory that maps a site name to a configured <see cref="WebSite"/>.
    /// </summary>
    class Fectory
    {
        /// <summary>
        /// Creates the site object for the given name with its save path and URL preset.
        /// </summary>
        /// <param name="type">Site name; only "cnBate" is recognised.</param>
        /// <returns>The configured site, or <c>null</c> when the name is not recognised.</returns>
        public static WebSite CreateObj(string type)
        {
            if (type != "cnBate")
            {
                return null;
            }

            return new cnBate
            {
                Path = "cnBate.xml",
                Url = @"http://www.cnbeta.com/"
            };
        }
    }
}
Form1
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Reflection;

namespace 采集程序3
{
    /// <summary>
    /// Main window: lists the available sites, loads articles from the
    /// selected one into the list view and saves them on request.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Site created by the most recent load click; null until then or when the name was unknown.</summary>
        WebSite ws;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Fills the combo box with the name of every concrete
        /// <see cref="WebSite"/> subclass found in the sender's assembly.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            // Discover the sites via reflection so new subclasses show up without editing this form.
            Assembly ass = sender.GetType().Assembly;
            Type[] types = ass.GetTypes();
            foreach (Type type in types)
            {
                if (typeof(WebSite).IsAssignableFrom(type) && !type.IsAbstract)
                {
                    WebSite site = Activator.CreateInstance(type) as WebSite;
                    comboBox1.Items.Add(site.name);
                }
            }
        }

        /// <summary>
        /// Creates the site named in the combo box, loads its articles and
        /// appends one list-view row per article.
        /// </summary>
        private void btnLoad_Click(object sender, EventArgs e)
        {
            ws = Fectory.CreateObj(comboBox1.Text);
            if (ws == null)
            {
                MessageBox.Show("该选项不存在");
                return;
            }

            List<Article> articles = ws.Load();
            foreach (Article item in articles)
            {
                ListViewItem lvi = new ListViewItem(item.Title);
                lvi.SubItems.Add(item.Author);
                lvi.SubItems.Add(item.Content);
                lvi.SubItems.Add(item.Time.ToString());
                listView1.Items.Add(lvi);
            }
        }

        /// <summary>
        /// Saves the articles of the most recently loaded site.
        /// </summary>
        private void btnSave_Click(object sender, EventArgs e)
        {
            // ws is null until a successful load; saving then has nothing to write.
            if (ws == null)
            {
                MessageBox.Show("请先采集新闻");
                return;
            }

            ws.Save();
            MessageBox.Show("保存成功");
        }

        /// <summary>
        /// Shows the first column text of the selected row in a message box.
        /// </summary>
        private void listView1_DoubleClick(object sender, EventArgs e)
        {
            // Guard against an empty selection, where SelectedItems[0] would throw.
            if (listView1.SelectedItems.Count == 0)
            {
                return;
            }

            MessageBox.Show(listView1.SelectedItems[0].SubItems[0].Text);
        }
    }
}