C#运用实例.读取csv里面的词条,对每一个词条抓取百度百科相关资料,然后存取到数据库

 

第一步:首先需要将csv先装换成datatable,这样我们就容易进行对datatable进行遍历:

  /// 将CSV文件的数据读取到DataTable中
    /// CSV文件路径
    /// 返回读取了CSV数据的DataTable
    public DataTable OpenCSV(string fileName)
    {
        DataTable dt = new DataTable();
        FileStream fs = new FileStream(fileName, System.IO.FileMode.Open, System.IO.FileAccess.Read);
        StreamReader sr = new StreamReader(fs, System.Text.Encoding.Default);
        //记录每次读取的一行记录
        string strLine = "";
        //记录每行记录中的各字段内容
        string[] aryLine;
        //标示列数
        int columnCount = 0;
        //标示是否是读取的第一行
        bool IsFirst = true;
        //逐行读取CSV中的数据
        while ((strLine = sr.ReadLine()) != null)
        {
            aryLine = strLine.Split(',');
            if (IsFirst == true)
            {
                IsFirst = false;
                columnCount = aryLine.Length;
                //创建列
                for (int i = 0; i < columnCount; i++)
                {
                    DataColumn dc = new DataColumn(aryLine[i]);
                    dt.Columns.Add(dc);
                }
            }
            else
            {
                DataRow dr = dt.NewRow();
                for (int j = 0; j < columnCount; j++)
                {
                    dr[j] = aryLine[j];
                }
                dt.Rows.Add(dr);
            }
        }
        sr.Close();
        fs.Close();
        return dt;
    }
将csv文件转换成datatable

特别备注:遍历dadatable的几种方式已经将datatable装换成csv文件:

一、将DataTable内容写入到CSV文件
        ///

        /// 将DataTable中的数据保存成CSV文件
        ///

        private void btnSaveCSV_Click(object sender, EventArgs e)
        {
            saveFileDialog1.Filter = "CSV文件|*.CSV";
            saveFileDialog1.InitialDirectory = "C:\\";
            if (saveFileDialog1.ShowDialog() == DialogResult.Cancel)
            {
                return;
            }
            else
            {
                string fileName = saveFileDialog1.FileName;
                SaveCSV(ds.Tables[0], fileName);
            }
        }
 
        ///

        /// 将DataTable中数据写入到CSV文件中
        ///

        /// 提供保存数据的DataTable
        /// CSV的文件路径
        public void SaveCSV(DataTable dt, string fileName)
        {
            FileStream fs = new FileStream(fileName, System.IO.FileMode.Create, System.IO.FileAccess.Write);
            StreamWriter sw = new StreamWriter(fs, System.Text.Encoding.Default);
            string data = "";
            //写出列名称
            for (int i = 0; i < dt.Columns.Count; i++)
            {
                data += dt.Columns[i].ColumnName.ToString();
                if (i < dt.Columns.Count - 1)
                {
                    data += ",";
                }
            }
            sw.WriteLine(data);
            //写出各行数据
            for (int i = 0; i < dt.Rows.Count; i++)
            {
                data = "";
                for (int j = 0; j < dt.Columns.Count; j++)
                {
                    data += dt.Rows[i][j].ToString();
                    if (j < dt.Columns.Count - 1)
                    {
                        data += ",";
                    }
                }
                sw.WriteLine(data);
            }
            sw.Close();
            fs.Close();
            MessageBox.Show("CSV文件保存成功!");
        }
 
将datatable转换成csv
遍历datatable的方法2009-09-08 10:02方法一:   
DataTable dt = dataSet.Tables[0];   
for(int i = 0 ; i < dt.Rows.Count ; i++)   
{   
   string strName = dt.Rows[i]["字段名"].ToString();   
}   
  
方法二:   
foreach(DataRow myRow in myDataSet.Tables["temp"].Rows)   
{   
      var str = myRow[0].ToString();   
}   
  
方法三:   
foeach(DataRow dr in dt.Rows)      
{      
     object value = dr["ColumnsName"];      
}   
  
方法四:   
DataTable dt=new DataTable();      
foreach(DataRow dr in dt.Rows)      
{    
   for(int i=0;i<dt.Columns.Count;i++)    
   {    
dr[i];    
   }      
}   
  
 
方法五
DataRow[] dataRows = null;
dataRows = dataTable.Select(fieldParentID + "='" + treeNode.Tag.ToString() + "'", dataTable.DefaultView.Sort);
foreach (DataRow dataRow in dataRows)
{ 

     DataRow dataRow = dataTable.Rows[i]; 

     ?? = dataRow[fieldParentID].ToString();
} 
遍历datatable的几种方式

第二步呢: 就是必须用到post方式提交啦,

    /// <summary>
    /// post数据请求,返回html代码
    /// </summary>
    /// <param name="areaName">输入百科词条</param>
    /// <returns></returns>
    public string UrlPost(string areaName)
    {
        Encoding encoding = Encoding.GetEncoding("UTF-8");
        Stream outstream = null;
        Stream instream = null;
        StreamReader sr = null;

        string url = "http://baike.baidu.com/search/word?word=" + areaName;

        HttpWebRequest request = null;

        HttpWebResponse response = null;


        // 准备请求,设置参数

        request = WebRequest.Create(url) as HttpWebRequest;

        request.Method = "POST";

        //request.ContentType = "application/x-www-form-urlencoded";
        request.ContentType = "text/xml";//注意了,上面的那种方式不可采用,因为当遇到xml格式的就会报远程服务器错误。500


        byte[] data = encoding.GetBytes(url);

        request.ContentLength = data.Length;

        outstream = request.GetRequestStream();

        outstream.Write(data, 0, data.Length);

        outstream.Flush();

        outstream.Close();

        //发送请求并获取相应回应数据


        response = request.GetResponse() as HttpWebResponse;

        //直到request.GetResponse()程序才开始向目标网页发送Post请求

        instream = response.GetResponseStream();

        sr = new StreamReader(instream, encoding);

        //返回结果网页(html)代码

        string content = sr.ReadToEnd();
        return content;

    }
post方式请求数据返回html数据

 第三部,就是遍历datatable,然后遍历出每个关键字,以post方式提交到百度百科,返回html,然后用正则表达式进行对数据的筛选

    /// <summary>
    /// 点击按钮执行程序
    /// </summary>
    /// <param name="sender"></param>
    /// <param name="e"></param>
    protected void btn_click_Click(object sender, EventArgs e)
    {
        DataTable dts = OpenCSV("C:\\Users\\kingtopinfo\\Desktop\\CITYLOCATION.csv");
       
        foreach (DataRow dr in dts.Rows)
        {
            string sql = "";
            string regstr = "";
            string regspit = "";
            string Title, Content = "";
            int Id = Convert.ToInt32(dr["ID"].ToString());
            int CityId = Convert.ToInt32(dr["CITYID"].ToString());
            decimal Latitude = Convert.ToDecimal(dr["LATITUDE"]);
            decimal Longitude = Convert.ToDecimal(dr["LONGITUDE"]);
            //提交数据给页面并返回html
            string value = dr["CITYNAME"].ToString();
            string html = UrlPost(value);
            //对数据进行处理。
            Regex reg = new Regex(@"<div class=""biItemInner"">(\s|\S)+?(<\/div>\s*?)");
            MatchCollection mc = reg.Matches(html);
            foreach (Match m in mc)
            {
                regspit += Regex.Replace(Regex.Replace(m.Value, @"</div[^>]*>", ":"), @"</span[^>]*>", ",");
               
            }
            regstr = Regex.Replace(regspit, @"</?div[^>]*>|</?span[^>]*>|</?a[^>]*>|&nbsp;", "");
            string[] area = Regex.Split(regstr, ":", RegexOptions.IgnoreCase);
            foreach (string i in area)
            {
                string[] areaspit = i.Split(',');
                if (areaspit.Length==2)
                {
                    Title = areaspit[0];
                    Content = areaspit[1];
                    SqlParameter[] paras = { 
                                           new SqlParameter("@Id", Id),
                                           new SqlParameter("@cityId", CityId),
                                           new SqlParameter("@latitude", Latitude),
                                           new SqlParameter("@longitude", Longitude),
                                           new SqlParameter("@cityName", value),
                                           new SqlParameter("@title", Title),
                                           new SqlParameter("@content", Content ),
                                           };
                   
                    sql = "INSERT INTO chinaarea(ID,CITYID,LATITUDE,LONGITUDE,CITYNAME,TITLE,CONTENT) VALUES (@Id,@cityId,@latitude,@longitude,@cityName,@title,@content)";
                    SaveSql(sql,paras);
               
                }
                   
                 
            }
        }
    }
遍历表格,发送请求,正则筛选,存入数据库

第四部分,就是数据库的操作了

const string connString = @"Data Source =.;Database = Area;Integrated Security=True";

  /// <summary>
    /// 存入数据库
    /// </summary>
    /// <param name="str"></param>
    public void SaveSql(string str,SqlParameter[] para)
    {
        SqlConnection conn = new SqlConnection(connString);
        conn.Open();
        string mysql = str;
        SqlCommand cmd = new SqlCommand(mysql, conn);
        cmd.Parameters.AddRange(para);
        cmd.ExecuteNonQuery();
        conn.Close();
    }
数据库操作

 此外,这里对正则表达式进行解释

 因为返回的都是这种类型的数据,而我们需要得到的是里面的内容,所以采取正则匹配和替换

<div class="biItem"><div class="biItemInner"><span class="biTitle">中文名称</span><div class="biContent">北京</div></div></div>

<div class="biItem"><div class="biItemInner"><span class="biTitle">外文名称</span><div class="biContent">Beijing</div></div></div>

 Regex reg = new Regex(@"<div class=""biItemInner"">(\s|\S)+?(<\/div>\s*?)");     找出<div class="biItemInner"><div>里的内容

Regex.Replace(Regex.Replace(m.Value, @"</div[^>]*>", ":"), @"</span[^>]*>", ","); 对div和span进行替换和以:分组(便于之后数据处理)

Regex.Replace(regspit, @"</?div[^>]*>|</?span[^>]*>|</?a[^>]*>|&nbsp;", ""); 将regspit里面&nbsp 还有a标签进行替换(也就是去掉里面的这些标签)

 

最终的代码整合:

 

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Data;
using System.Data.SqlClient;
using System.Text;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
public partial class csv : System.Web.UI.Page
{
    const string connString = @"Data Source =.;Database = Area;Integrated Security=True";
    protected void Page_Load(object sender, EventArgs e)
    {
       
    }
    /// <summary>
    /// 点击按钮执行程序
    /// </summary>
    /// <param name="sender"></param>
    /// <param name="e"></param>
    protected void btn_click_Click(object sender, EventArgs e)
    {
        DataTable dts = OpenCSV("C:\\Users\\kingtopinfo\\Desktop\\CITYLOCATION.csv");
       
        foreach (DataRow dr in dts.Rows)
        {
            string sql = "";
            string regstr = "";
            string regspit = "";
            string Title, Content = "";
            int Id = Convert.ToInt32(dr["ID"].ToString());
            int CityId = Convert.ToInt32(dr["CITYID"].ToString());
            decimal Latitude = Convert.ToDecimal(dr["LATITUDE"]);
            decimal Longitude = Convert.ToDecimal(dr["LONGITUDE"]);
            //提交数据给页面并返回html
            string value = dr["CITYNAME"].ToString();
            string html = UrlPost(value);
            //对数据进行处理。
            Regex reg = new Regex(@"<div class=""biItemInner"">(\s|\S)+?(<\/div>\s*?)");
            MatchCollection mc = reg.Matches(html);
            foreach (Match m in mc)
            {
                regspit += Regex.Replace(Regex.Replace(m.Value, @"</div[^>]*>", ":"), @"</span[^>]*>", ",");
               
            }
            regstr = Regex.Replace(regspit, @"</?div[^>]*>|</?span[^>]*>|</?a[^>]*>|&nbsp;", "");
            string[] area = Regex.Split(regstr, ":", RegexOptions.IgnoreCase);
            foreach (string i in area)
            {
                string[] areaspit = i.Split(',');
                if (areaspit.Length==2)
                {
                    Title = areaspit[0];
                    Content = areaspit[1];
                    SqlParameter[] paras = { 
                                           new SqlParameter("@Id", Id),
                                           new SqlParameter("@cityId", CityId),
                                           new SqlParameter("@latitude", Latitude),
                                           new SqlParameter("@longitude", Longitude),
                                           new SqlParameter("@cityName", value),
                                           new SqlParameter("@title", Title),
                                           new SqlParameter("@content", Content ),
                                           };
                   
                    sql = "INSERT INTO chinaarea(ID,CITYID,LATITUDE,LONGITUDE,CITYNAME,TITLE,CONTENT) VALUES (@Id,@cityId,@latitude,@longitude,@cityName,@title,@content)";
                    SaveSql(sql,paras);
               
                }
                   
                 
            }
        }
    }

    /// 将CSV文件的数据读取到DataTable中
    /// CSV文件路径
    /// 返回读取了CSV数据的DataTable
    public DataTable OpenCSV(string fileName)
    {
        DataTable dt = new DataTable();
        FileStream fs = new FileStream(fileName, System.IO.FileMode.Open, System.IO.FileAccess.Read);
        StreamReader sr = new StreamReader(fs, System.Text.Encoding.Default);
        //记录每次读取的一行记录
        string strLine = "";
        //记录每行记录中的各字段内容
        string[] aryLine;
        //标示列数
        int columnCount = 0;
        //标示是否是读取的第一行
        bool IsFirst = true;
        //逐行读取CSV中的数据
        while ((strLine = sr.ReadLine()) != null)
        {
            aryLine = strLine.Split(',');
            if (IsFirst == true)
            {
                IsFirst = false;
                columnCount = aryLine.Length;
                //创建列
                for (int i = 0; i < columnCount; i++)
                {
                    DataColumn dc = new DataColumn(aryLine[i]);
                    dt.Columns.Add(dc);
                }
            }
            else
            {
                DataRow dr = dt.NewRow();
                for (int j = 0; j < columnCount; j++)
                {
                    dr[j] = aryLine[j];
                }
                dt.Rows.Add(dr);
            }
        }
        sr.Close();
        fs.Close();
        return dt;
    }

    /// <summary>
    /// 存入数据库
    /// </summary>
    /// <param name="str"></param>
    public void SaveSql(string str,SqlParameter[] para)
    {
        SqlConnection conn = new SqlConnection(connString);
        conn.Open();
        string mysql = str;
        SqlCommand cmd = new SqlCommand(mysql, conn);
        cmd.Parameters.AddRange(para);
        cmd.ExecuteNonQuery();
        conn.Close();
    }

    /// <summary>
    /// post数据请求,返回html代码
    /// </summary>
    /// <param name="areaName">输入百科词条</param>
    /// <returns></returns>
    public string UrlPost(string areaName)
    {
        Encoding encoding = Encoding.GetEncoding("UTF-8");
        Stream outstream = null;
        Stream instream = null;
        StreamReader sr = null;

        string url = "http://baike.baidu.com/search/word?word=" + areaName;

        HttpWebRequest request = null;

        HttpWebResponse response = null;


        // 准备请求,设置参数

        request = WebRequest.Create(url) as HttpWebRequest;

        request.Method = "POST";

        //request.ContentType = "application/x-www-form-urlencoded";
        request.ContentType = "text/xml";//注意了,上面的那种方式不可采用,因为当遇到xml格式的就会报远程服务器错误。500


        byte[] data = encoding.GetBytes(url);

        request.ContentLength = data.Length;

        outstream = request.GetRequestStream();

        outstream.Write(data, 0, data.Length);

        outstream.Flush();

        outstream.Close();

        //发送请求并获取相应回应数据





        response = request.GetResponse() as HttpWebResponse;

        //直到request.GetResponse()程序才开始向目标网页发送Post请求

        instream = response.GetResponseStream();

        sr = new StreamReader(instream, encoding);

        //返回结果网页(html)代码

        string content = sr.ReadToEnd();
        return content;

    }

    

}
View Code

 

posted @ 2015-07-17 18:02  sucer德  阅读(248)  评论(0编辑  收藏  举报