Daily scrum 11.6

今日任务统计:

 

燃尽图:

张永强同学编写的处理 HTML 并提取数据的代码:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using commons;
using HtmlAgilityPack;
using System.IO;

namespace HTMLParser
{
/// <summary>
/// Loads an HTML file with HtmlAgilityPack and copies its title, meta description
/// and meta keywords into a <c>CommonPageInfo</c> instance.
/// </summary>
public class HTMLParser : IHtmlParser
{
    CommonPageInfo page;
    HtmlDocument htmlDocument;

    /// <summary>
    /// Creates a parser with an empty page-info record to be filled by the extraction methods.
    /// </summary>
    public HTMLParser()
    {
        page = new CommonPageInfo();
    }

    /// <summary>
    /// Reads the HTML file at <paramref name="fileName"/> and makes it the current document.
    /// </summary>
    /// <param name="fileName">Path of the HTML file to load.</param>
    public void readHtmlDoc(string fileName)
    {
        HtmlDocument document = new HtmlDocument();

        // Dispose the reader as soon as parsing is done so the file handle is not leaked.
        using (StreamReader sr = new StreamReader(fileName))
        {
            document.Load(sr); // read the html file
        }

        // Only replace the current document once loading succeeded.
        htmlDocument = document;
    }

    /// <summary>
    /// Extracts the page title from <c>/html/head/title</c> and stores it in the page info.
    /// Each title element contributes its trimmed text followed by a semicolon;
    /// the title stays null when the document has no title element.
    /// </summary>
    public void genPageTitle()
    {
        string title = null;
        HtmlNodeCollection nodes = htmlDocument.DocumentNode.SelectNodes("/html/head/title");

        // SelectNodes returns null (not an empty collection) when nothing matches.
        if (nodes != null)
        {
            foreach (HtmlNode node in nodes)
            {
                title += node.InnerText.Trim();
                title += ";"; // there may be several title elements, so terminate each part with a semicolon
            }
        }

        page.Title = title;
    }

    /// <summary>
    /// Extracts the page description from the document's meta tags.
    /// If the page source already declares a description, its content is stored
    /// directly in the page info; the description stays null when none is declared.
    /// </summary>
    public void getPageDescription()
    {
        string description = null;
        foreach (string content in getMetaContents("description"))
        {
            description += content;
        }
        page.Description = description;
    }

    /// <summary>
    /// Extracts the keywords declared in the document's meta tags and appends them
    /// to the page's keyword list. Nothing is added when the page declares no keywords.
    /// </summary>
    public void getPageKeywords()
    {
        // NOTE(review): keywords meta tags are commonly comma-separated; the split stays
        // on spaces as in the original implementation - confirm the desired tokenization.
        char[] separators = { ' ' };

        foreach (string content in getMetaContents("keywords"))
        {
            // Split each tag separately so words from adjacent tags are not glued together,
            // and drop the empty entries produced by repeated spaces.
            string[] words = content.Trim().Split(separators, StringSplitOptions.RemoveEmptyEntries);
            foreach (string word in words)
            {
                page.KeyWords.Add(word);
            }
        }
    }

    /// <summary>
    /// Collects the <c>content</c> values of every <c>/html/head/meta</c> element whose
    /// <c>name</c> attribute equals <paramref name="metaName"/> (case-insensitive).
    /// </summary>
    /// <param name="metaName">Value of the meta tag's name attribute, e.g. "description".</param>
    /// <returns>The matching content values in document order; empty when none match.</returns>
    private List<string> getMetaContents(string metaName)
    {
        List<string> contents = new List<string>();
        HtmlNodeCollection nodes = htmlDocument.DocumentNode.SelectNodes("/html/head/meta");

        // SelectNodes returns null (not an empty collection) when nothing matches.
        if (nodes == null)
        {
            return contents;
        }

        foreach (HtmlNode node in nodes)
        {
            // Look the attributes up by name so the result does not depend on attribute order.
            HtmlAttribute nameAttr = node.Attributes["name"];
            HtmlAttribute contentAttr = node.Attributes["content"];
            if (nameAttr == null || contentAttr == null || nameAttr.Value == null)
            {
                continue;
            }

            if (string.Equals(nameAttr.Value.Trim(), metaName, StringComparison.OrdinalIgnoreCase))
            {
                contents.Add(contentAttr.Value ?? string.Empty);
            }
        }

        return contents;
    }
}
}

posted on 2012-11-06 23:22  fightingsnail1  阅读(240)  评论(1)  编辑  收藏  举报

导航