1、找出url汇总页，过滤出满足条件的详情页url；2、去详情页采集信息

package main

import (
   "fmt"

   "github.com/gocolly/colly"
   "regexp"
   "strings"

   "github.com/mongodb/mongo-go-driver/mongo"
   "github.com/mongodb/mongo-go-driver/bson"

   "context"
   "log"
   "gopkg.in/mgo.v2"
   mgoBson "gopkg.in/mgo.v2/bson"
   "time"
   "math/rand"
)

/*
task
http://www.cnhan.com/hyzx/
http://www.cnhan.com/shantui/
http://www.cnhan.com/pinfo/
http://www.heze.cn/info
http://www.heze.cn/qiye/
http://cn.sonhoo.com/wukong/
采集站点当日更新数据的客户联系方式

20180828
站点日期字符串特征
"cnhan.com/hyzx/":    "20180902",
"cnhan.com/shantui/": "20180902",
"cnhan.com/pinfo/":   "2018.09.02",
"heze.cn/info/":      "2018.09.02",
"heze.cn/qiye/":      "2018.09.02",
"sonhoo.com/wukong/": "2018-09-02",

20180904
http://cn.sonhoo.com/wukong/c16?offset=600&limit=50 先去文章含有文章日期的列表页遍历出符合条件的文章url，
再去文章详情页http://cn.sonhoo.com/wukong/a213383采集客户资料

*/

type PotentialCustomerWebSiteUrl struct {
   Url string `bson:"url"`
}

// Configuration | Colly http://go-colly.org/docs/introduction/configuration/
const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

func RandomString() string {
   b := make([]byte, rand.Intn(10)+10)
   for i := range b {
      b[i] = letterBytes[rand.Intn(len(letterBytes))]
   }
   return string(b)
}

// url汇总页的日期筛选方法
func eggSitePathListTargetDate(TargetDate string) map[string]string {
   SitePathListTargetDateFmt := map[string]string{
      "cnhan.com/hyzx/":    "",
      "cnhan.com/shantui/": "ONLYtoday",
      "cnhan.com/pinfo/":   ".",
      "heze.cn/info/":      ".",
      "heze.cn/qiye/":      ".",
      "sonhoo.com/wukong/": "-",
   }
   for k, v := range SitePathListTargetDateFmt {
      fmt.Println(k)
      if ( v == "") {
         SitePathListTargetDateFmt[k] = TargetDate
      } else {
         SitePathListTargetDateFmt[k] = TargetDate[0:4] + v + TargetDate[4:6] + v + TargetDate[6:8]
      }
   }
   fmt.Println(SitePathListTargetDateFmt)
   return SitePathListTargetDateFmt
}

//  指定日期数据采集
var TargetDate = "20180904"
var TodayDate = time.Now().Format("20060102")
var mongoCollectioName = "todayUrls0904TestData"
var SitePathListTargetDate = eggSitePathListTargetDate(TargetDate)

func getTargetDateSpideredUrl() []string {
   //查询mongodb数据
   session, err := mgo.Dial("mongodb://hbaseU:123@192.168.3.103:27017/hbase")
   if err != nil {
      panic(err)
   }
   defer session.Close()
   // Optional. Switch the session to a monotonic behavior.
   session.SetMode(mgo.Monotonic, true)
   Collection := session.DB("hbase").C(mongoCollectioName)
   var PotentialCustomerWebSiteUrls [] PotentialCustomerWebSiteUrl
   err = Collection.Find(mgoBson.M{"spiderDate": TargetDate}).All(&PotentialCustomerWebSiteUrls)
   if err != nil {
      log.Fatal(err)
   }
   var PotentialCustomerWebSiteUrlSet [] string
   for i, v := range PotentialCustomerWebSiteUrls {
      fmt.Println(i)
      fmt.Println(v.Url)
      PotentialCustomerWebSiteUrlSet = append(PotentialCustomerWebSiteUrlSet, v.Url)
   }
   return PotentialCustomerWebSiteUrlSet
}

// 检查元素是否存在于数组？遍历？如何集合运算方法
func eleInArr(ele string, arr [] string) bool {
   for _, v := range arr {
      if (ele == v) {
         fmt.Println("eleInArr", ele)
         return true
      }
   }
   return false
}

func getTargetDateUrls() []string {
   var targetDateUrls []string
   PotentialCustomerWebSiteUrlSet := getTargetDateSpideredUrl()
   c := colly.NewCollector()
   if (TargetDate == TodayDate) {
      // 路径下只有当日url
      // Instantiate default collector
      c = colly.NewCollector(
         colly.AllowedDomains("www.cnhan.com"),
      )
      // On every a element which has href attribute call callback
      // 类选择器
      //url仅在本页
      c.OnHTML(".showSort a[href]", func(e *colly.HTMLElement) {

         link := e.Attr("href")
         t := eleInArr(link, PotentialCustomerWebSiteUrlSet)
         if (!t) {
            targetDateUrls = append(targetDateUrls, link)
            fmt.Printf("Link found: %q -> %s\n", e.Text, link)
         }
      })
      // Start scraping on http://www.cnhan.com/shantui/
      c.Visit("http://www.cnhan.com/shantui/")

      // 起始路由改变
      // Instantiate default collector
      c = colly.NewCollector(
         colly.AllowedDomains("www.cnhan.com"),
         colly.URLFilters(
            //请求页面的正则表达式，满足其一即可
            //http://www.cnhan.com/hyzx/
            //http://www.cnhan.com/hyzx/index-all-2.html
            //硬代码：目标日最多更新99页http://www.cnhan.com/hyzx/index-all-99.html
            //^[1-9][0-9]{0,1}[^0-9]{0,1}$
            regexp.MustCompile("^http://www.cnhan.com/hyzx/(.{0}$)|(index-all-[1-9][0-9]{0,1}[^0-9]{0,1}\\.html$)"),
         ),
      )

      // On every a element which has href attribute call callback
      c.OnHTML("a[href]", func(e *colly.HTMLElement) {
         link := e.Attr("href")
         fmt.Printf("Link found: %q -> %s\n", e.Text, link)
         c.Visit(e.Request.AbsoluteURL(link))
         d := SitePathListTargetDate["cnhan.com/hyzx/"]
         reg := regexp.MustCompile(d) // http://www.cnhan.com/hyzx/20180827/7109076.html 通过url格式过滤出目标日的url
         data := reg.Find([]byte(link))
         regRes := len(data)
         if regRes > 0 {
            link = "http://www.cnhan.com/hyzx/" + link
            t := eleInArr(link, PotentialCustomerWebSiteUrlSet)
            if (!t) {
               targetDateUrls = append(targetDateUrls, link)
               fmt.Printf("Link found: %q -> %s\n", e.Text, link)
            }
         }
      })
      // Before making a request print "Visiting ..."
      c.OnRequest(func(r *colly.Request) {
         fmt.Println("Visiting", r.URL.String())
      })
      // Start scraping on http://www.cnhan.com/shantui/
      c.Visit("http://www.cnhan.com/hyzx/")

   }

   // 起始路由改变
   // Instantiate default collector
   c = colly.NewCollector(
      colly.AllowedDomains("www.cnhan.com"),
      colly.URLFilters(
         //请求页面的正则表达式，满足其一即可
         //http://www.cnhan.com/pinfo/
         //http://www.cnhan.com/pinfo/index-5.html
         //硬代码：目标日最多更新99页http://www.cnhan.com/pinfo/index-99.html
         regexp.MustCompile("^http://www.cnhan.com/pinfo/(.{0}$)|(index-[1-9][0-9]{0,1}[^0-9]{0,1}\\.html$)"),
      ),
   )
   // On every a element which has href attribute call callback
   c.OnHTML("a[href]", func(e *colly.HTMLElement) {
      link := e.Attr("href")
      fmt.Printf("Link found: %q -> %s\n", e.Text, link)
      c.Visit(e.Request.AbsoluteURL(link))
      //文本过滤
      eDate := e.ChildText(".span2")
      //http://www.cnhan.com/pinfo/313257.html   周口水泥彩砖具有的特色是什么2018.08.27
      d := SitePathListTargetDate["cnhan.com/pinfo/"]
      if (strings.Contains(eDate, d)) {
         link := e.Attr("href")
         link = "http://www.cnhan.com" + link
         fmt.Printf("Link found: %q -> %s\n", e.Text, link)
         t := eleInArr(link, PotentialCustomerWebSiteUrlSet)
         if (!t) {
            targetDateUrls = append(targetDateUrls, link)
            fmt.Printf("Link found: %q -> %s\n", e.Text, link)
         }
      }
   })
   // Before making a request print "Visiting ..."
   c.OnRequest(func(r *colly.Request) {
      fmt.Println("Visiting", r.URL.String())
   })
   // Start scraping on http://www.cnhan.com/shantui/
   c.Visit("http://www.cnhan.com/pinfo/")

   // 起始路由改变
   // Instantiate default collector
   c = colly.NewCollector(
      colly.AllowedDomains("www.heze.cn"),
   )
   // On every a element which has href attribute call callback
   // 类选择器
   c.OnHTML(".news_list_r a[href]", func(e *colly.HTMLElement) {
      link := e.Attr("href")
      fmt.Printf("Link found: %q -> %s\n", e.Text, link)
      targetDateUrls = append(targetDateUrls, link)
   })
   // Start scraping on http://www.cnhan.com/shantui/
   c.Visit("http://www.heze.cn/info/")

   /*
   站内目标url
   http://www.heze.cn/info/
   http://www.heze.cn/qiye/
   检测思路：
   1、按父url，分别进入 http://www.heze.cn/qiye/18240670888/show-37-1367148.html  http://www.heze.cn/info/LEbigong/show-1-13931879.html
      与2反
   2、按照全站进入
      优点：过滤规则简单，代码代码简单；爬取结果数据不便于分类处理，比如产品类型、发布时间；
      缺点：爬爬取速度慢
   */

   // 起始路由改变
   //http://www.heze.cn/qiye/  该页面、其主体子页面，刷新，内容变化
   //http://www.heze.cn/qiye/list-8.html
   // Instantiate default collector
   c = colly.NewCollector(
      colly.AllowedDomains("www.heze.cn"),
      colly.URLFilters(
         //请求页面的正则表达式，满足其一即可
         regexp.MustCompile("^http://www.heze.cn/qiye/(.{0}$)|(list-\\d+-\\d+\\.html$)"),
      ),
   )
   // On every a element which has href attribute call callback
   c.OnHTML("a[href]", func(e *colly.HTMLElement) {
      link := e.Attr("href")
      fmt.Printf("Link found: %q -> %s\n", e.Text, link)
      c.Visit(e.Request.AbsoluteURL(link))
      // http://www.heze.cn/qiye/hongfei688/show-44-14825619.html
      reg := regexp.MustCompile("^http://www.heze.cn/qiye/[0-9a-zA-Z]+/show-\\d+-\\d+\\.html$")
      data := reg.Find([]byte(link))
      regRes := len(data)
      if regRes > 0 {
         fmt.Printf("Link found: %q -> %s\n", e.Text, link)
         t := eleInArr(link, PotentialCustomerWebSiteUrlSet)
         if (!t) {
            targetDateUrls = append(targetDateUrls, link)
            fmt.Printf("Link found: %q -> %s\n", e.Text, link)
         }
      }
   })
   // Before making a request print "Visiting ..."
   c.OnRequest(func(r *colly.Request) {
      fmt.Println("Visiting", r.URL.String())
   })
   // Start scraping on http://www.heze.cn/qiye/
   c.Visit("http://www.heze.cn/qiye/")

   // 起始路由改变
   // 目标日页面部分存在于起始页且与飞目标日页面混杂，无法通过选择器判断，需解析文本
   // 全站过滤
   // 类目页 http://cn.sonhoo.com/wukong/c133
   // 文章页 http://cn.sonhoo.com/wukong/a191114
   c = colly.NewCollector(
      colly.AllowedDomains("cn.sonhoo.com"),
      colly.URLFilters(
         //请求页面的正则表达式，满足其一即可
         regexp.MustCompile("^http://cn.sonhoo.com/wukong/$"),
         //regexp.MustCompile("^http://cn.sonhoo.com/wukong/[ac]{1}\\d+$"),
         regexp.MustCompile("^http://cn.sonhoo.com/wukong/[c]{1}\\d+$"),
         // http://cn.sonhoo.com/wukong/c0?offset=150&limit=50 文章列表页
         regexp.MustCompile("^http://cn.sonhoo.com/wukong/c\\d+\\?offset=\\d+\\&limit=\\d+$"),
      ),
      // 不加UA，无数据
      // colly.UserAgent("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"),
   )
   // 限制线程数，引入随机延迟
   // Limit the number of threads started by colly to two
   // when visiting links which domains' matches "*httpbin.*" glob
   c.Limit(&colly.LimitRule{
      DomainGlob:  "*sonhoo.*",
      Parallelism: 5,
      RandomDelay: 5 * time.Second,
   })

   // 保证遍历http://cn.sonhoo.com/wukong/c4?offset=100&limit=50各个页面的，获取可能的目标日的url
   // On every a element which has href attribute call callback
   c.OnHTML("a[href]", func(e *colly.HTMLElement) {
      link := e.Attr("href")
      fmt.Printf("Link found: %q -> %s\n", e.Text, link)
      c.Visit(e.Request.AbsoluteURL(link))
   })

   // 目标日的url入库
   // div[class=page-articles__articles]>ul>li
   c.OnHTML("div[class=page-articles__articles]>ul>li", func(e *colly.HTMLElement) {
      link := e.ChildAttr("a", "href")
      dat1 := e.ChildText("span")
      fmt.Println(dat1)
      fmt.Println(link)
      pageDate := e.ChildText("span")
      d := SitePathListTargetDate["sonhoo.com/wukong/"]
      fmt.Println("tDate", d)
      fmt.Println("pageDate", pageDate)
      if (strings.Contains(pageDate, d)) {
         link = "http://cn.sonhoo.com" + link
         t := eleInArr(link, PotentialCustomerWebSiteUrlSet)
         if (!t) {
            targetDateUrls = append(targetDateUrls, link)
            fmt.Printf("Link found: %q -> %s\n", e.Text, link)
         }
      }
      c.Visit(e.Request.AbsoluteURL(link))
   })

   c.OnScraped(func(r *colly.Response) {
      fmt.Println("Finished", r.Request.URL)
   })
   // Before making a request print "Visiting ..."
   c.OnRequest(func(r *colly.Request) {
      fmt.Println("Visiting", r.URL.String())
      r.Headers.Set("User-Agent", RandomString())
   })
   c.OnError(func(_ *colly.Response, err error) {
      log.Println("Something went wrong:", err)
   })
   c.OnResponse(func(r *colly.Response) {
      fmt.Println("Visited", r.Request.URL)
   })
   // Start scraping on
   // http://cn.sonhoo.com/wukong/ 不带日期文章列表页
   // http://cn.sonhoo.com/wukong/c0 20180904 带日期的文章列表页：共50页，没有l类目
   c.Visit("http://cn.sonhoo.com/wukong/c0")
   // 等待线程结束
   // Wait until threads are finished
   c.Wait()

   return targetDateUrls
}

func main() {
   var targetDateUrls = getTargetDateUrls()
   fmt.Println(targetDateUrls)
   fmt.Println(len(targetDateUrls))
   for _, targetDateUrl := range targetDateUrls {
      // Instantiate default collector
      c := colly.NewCollector()
      // On every a element which has href attribute call callback
      c.OnScraped(func(r *colly.Response) {
         reqUrl := fmt.Sprintln(r.Request.URL)
         strings.Replace(reqUrl, "\n", "", -1)
         wholePageHtml := string(r.Body)
         client, err := mongo.Connect(context.Background(), "mongodb://hbaseU:123@192.168.3.103:27017/hbase", nil)
         db := client.Database("hbase")
         coll := db.Collection(mongoCollectioName)
         if err != nil {
            fmt.Println(err)
         }
         //目标日多次采集，目标日同站点且同路径url不重复入库
         //存入时间戳，分析目标站点的信息更新规律
         result, err := coll.InsertOne(
            context.Background(),
            bson.NewDocument(
               bson.EC.String("spiderDate", TargetDate),
               bson.EC.String("url", reqUrl),
               bson.EC.String("html", wholePageHtml),
            ))
         fmt.Println(err)
         fmt.Println(result)
      })
      c.Visit(targetDateUrl)
   }
}
posted @ 2018-09-04 11:45 papering 阅读(604) 评论(0) 收藏举报
刷新页面返回顶部