两道面试题,欢迎高手来指正

一、用ASP.NET实现正则抓取匹配 www.163.com 首页的财经专栏

要求:提取出链接和标题

时间:1小时内

注:首先,需要动态获取 http://www.163.com 的内容;获取到网易的首页内容后,用正则进行匹配,得到想要的内容,并把标题和链接保存为数组。

我的完成:

 

代码
using System;
using System.Configuration;
using System.Data;
using System.Linq;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.HtmlControls;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Xml.Linq;
using System.Net;
using System.IO;

using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Code-behind for a page that downloads the www.163.com home page, cuts out the
/// section between the finance-column link and the health-column text, and lists
/// every anchor found in that section in <c>Label1</c>.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    /// <summary>Anchor that marks the start of the finance-column section.</summary>
    private const string SectionStartMarker = "<a href=\"http://money.163.com/blog/\">财经专栏</a>";

    /// <summary>Text that marks the end of the finance-column section.</summary>
    private const string SectionEndMarker = "健康专栏";

    /// <summary>
    /// Matches a simple anchor element; the named groups <c>Link</c> and <c>Text</c>
    /// capture the raw href value and the inner text. Built once instead of per click.
    /// </summary>
    private static readonly Regex AnchorPattern =
        new Regex(@"<a[\s]+href=(?<Link>[^\s>]+)[^>]*>(?<Text>[^<]*)</a>");

    protected void Page_Load(object sender, EventArgs e)
    {

    }

    /// <summary>
    /// Fetches the home page, isolates the finance-column section and writes each
    /// matched anchor element, separated by line breaks, into <c>Label1</c>.
    /// When either section marker is missing the label is cleared instead of
    /// letting <c>Substring</c> throw on a negative index.
    /// </summary>
    /// <param name="sender">The control that raised the click event.</param>
    /// <param name="e">Event data (unused).</param>
    protected void btn163_Click(object sender, EventArgs e)
    {
        string html = GetSiteContent();

        // The markers are exact markup/text fragments, so compare ordinally
        // rather than with the current culture's linguistic rules.
        int start = html.IndexOf(SectionStartMarker, StringComparison.Ordinal);
        if (start < 0)
        {
            this.Label1.Text = string.Empty;
            return;
        }

        string section = html.Substring(start);
        int end = section.IndexOf(SectionEndMarker, StringComparison.Ordinal);
        if (end < 0)
        {
            this.Label1.Text = string.Empty;
            return;
        }

        section = section.Substring(0, end);

        StringBuilder output = new StringBuilder();
        foreach (Match anchor in AnchorPattern.Matches(section))
        {
            // The whole matched anchor element is emitted so the label renders
            // clickable links; "<br />" replaces the invalid "</br>" tag.
            output.Append(anchor.Value).Append("<br />");
        }

        this.Label1.Text = output.ToString();
    }

    /// <summary>
    /// Downloads http://www.163.com and returns the response body as text.
    /// </summary>
    /// <returns>The full response body decoded with <see cref="Encoding.Default"/>.</returns>
    private string GetSiteContent()
    {
        WebRequest request = WebRequest.Create("http://www.163.com");

        // using-blocks release the response, stream and reader even when the
        // read throws part-way through.
        using (WebResponse response = request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        // NOTE(review): Encoding.Default is the host's ANSI code page; the text is
        // only decoded correctly if that matches the page's charset - confirm.
        using (StreamReader reader = new StreamReader(stream, Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }
}

 

二、有一个文本,内容为

http://www.test.com/info.html
http://www.test.com/info3.html
http://www.test.com/info2.html
http://www.test.com/info1.html
http://www.test.com/info2.html
http://www.test.com/info1.html

 

要求结果为去除重复项,按出现次数进行排序

我的完成:

 

代码
using System;
using System.Collections;
using System.Configuration;
using System.Data;
using System.Linq;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.HtmlControls;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Xml.Linq;
using System.IO;
using System.Text;

namespace WebApplication1
{
    /// <summary>
    /// Code-behind for a page that reads an uploaded text file line by line,
    /// removes duplicate lines and orders the remaining lines by how often each
    /// one occurred (most frequent first).
    /// </summary>
    public partial class _Default : System.Web.UI.Page
    {
        protected void Page_Load(object sender, EventArgs e)
        {

        }

        /// <summary>
        /// Reads the uploaded lines and ranks the distinct ones by occurrence count.
        /// </summary>
        /// <param name="sender">The control that raised the click event.</param>
        /// <param name="e">Event data (unused).</param>
        protected void Button1_Click(object sender, EventArgs e)
        {
            string[] lines = GetStringList();
            string[] ranked = RankByFrequency(lines);

            // NOTE(review): as in the original handler, the ranked list is not
            // written to any control or to the response; no output control is
            // visible in this file. TODO: bind `ranked` to the page's output.
        }

        /// <summary>
        /// Returns the distinct entries of <paramref name="lines"/> ordered by
        /// descending occurrence count.
        /// </summary>
        /// <param name="lines">The lines to rank; may contain duplicates.</param>
        /// <returns>
        /// One entry per distinct line. Lines with equal counts keep the order of
        /// their first appearance, because GroupBy yields keys in first-seen order
        /// and OrderByDescending is a stable sort.
        /// </returns>
        private static string[] RankByFrequency(string[] lines)
        {
            return lines
                .GroupBy(line => line)
                .OrderByDescending(group => group.Count())
                .Select(group => group.Key)
                .ToArray();
        }

        /// <summary>
        /// Reads the file posted through <c>File1</c> and splits it into its
        /// non-empty lines.
        /// </summary>
        /// <returns>
        /// The non-empty lines of the upload, or an empty array when no file was posted.
        /// </returns>
        private string[] GetStringList()
        {
            // No file in the request: return nothing rather than dereferencing null.
            if (this.File1.PostedFile == null)
            {
                return new string[0];
            }

            string text;
            using (StreamReader reader = new StreamReader(this.File1.PostedFile.InputStream, Encoding.Default))
            {
                text = reader.ReadToEnd();
            }

            // Splitting on both '\r' and '\n' produces empty entries between the
            // two characters of a CRLF pair; RemoveEmptyEntries drops them.
            return text.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
        }

    }
}

 

posted @ 2010-05-23 19:08  Jesong  阅读(355)  评论(1)  编辑  收藏  举报