colly爬虫 go

package main

import (
	"bufio"
	"fmt"
	"github.com/antchfx/htmlquery"
	"github.com/gocolly/colly"
	"github.com/gocolly/colly/extensions"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
	"time"
)
// Shared state between main and saveTxt.
var (
	// wg counts texts that still have to be saved: main calls wg.Add(1) for
	// every text link it finds and blocks on wg.Wait before exiting, while
	// saveTxt calls wg.Done when it returns.
	wg sync.WaitGroup

	// ch is used as a counting semaphore. main allocates it with a buffer of
	// 10 and sends one value per text link found; saveTxt receives one value
	// when it has finished writing.
	ch chan int
)

// main crawls www.uidzhx.com starting from a fixed seed page. Every response
// is checked for a book title (by XPath) and for a link to a .txt file on
// dzs.uidzhx.com; when both are present the text is downloaded and stored in
// the background by saveTxt. main returns once the crawl has finished and all
// pending saves are done.
func main() {
	// ch is a counting semaphore: at most 10 saves may be in flight at once.
	ch = make(chan int, 10)

	// Compile the patterns once instead of once per response.
	reNotAllow := regexp.MustCompile(`http://www.uidzhx.com/du/.*.html`)
	reTxt := regexp.MustCompile(`http://dzs.uidzhx.com.*\.txt`)

	// The AllowURLRevisit option is intentionally not passed: revisiting is
	// switched off right below, so passing it only contradicted the setup.
	c := colly.NewCollector(
		colly.AllowedDomains("www.uidzhx.com"),
		colly.IgnoreRobotsTxt(),
		colly.DisallowedURLFilters(reNotAllow),
	)
	c.AllowURLRevisit = false
	c.Async = false
	extensions.RandomUserAgent(c)
	extensions.Referer(c)

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	if err := c.Limit(&colly.LimitRule{
		DomainGlob: "*",
		//Parallelism: 2,
		RandomDelay: 1 * time.Second,
	}); err != nil {
		log.Fatalf("set limit rule: %v", err)
	}

	// Follow every link on every page.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Visit returns an error for links the collector refuses (already
		// visited, filtered, other domain); that is routine during a crawl,
		// so the result is deliberately dropped.
		_ = c.Visit(e.Request.AbsoluteURL(link))
	})

	// After a response has been received.
	c.OnResponse(func(r *colly.Response) {
		body := string(r.Body)

		doc, err := htmlquery.Parse(strings.NewReader(body))
		if err != nil {
			// One unparsable page must not terminate the whole crawl.
			log.Printf("parse %s: %v", r.Request.URL, err)
			return
		}

		title := htmlquery.FindOne(doc, `/html/body/div[4]/div[2]/div[1]/div/div[2]/div/h1`)
		if title == nil {
			return
		}
		url := reTxt.FindString(body)
		if url == "" {
			return
		}

		txtTitle := strings.Replace(htmlquery.InnerText(title), " ", "", -1)
		fmt.Println(txtTitle)

		// Register the pending save and take a semaphore slot (this blocks the
		// crawl while 10 saves are already running). saveTxt gives both back.
		// Without the goroutine below nothing ever released them and the
		// program hung in wg.Wait.
		wg.Add(1)
		ch <- 1
		go saveTxt(txtTitle, url)
	})

	if err := c.Visit("http://www.uidzhx.com/Shtml89401.html"); err != nil {
		log.Printf("visit seed page: %v", err)
	}

	wg.Wait()
}

// saveTxt downloads url and writes the text to d:/crawl/<title>.txt,
// replacing any previous content of that file.
//
// The caller must have called wg.Add(1) and sent one value on ch beforehand;
// saveTxt undoes both on every exit path. Failures are printed and otherwise
// ignored so that a single bad download does not stop the crawl.
func saveTxt(title string, url string) {
	defer wg.Done()
	// Free the semaphore slot even when we bail out early.
	defer func() { <-ch }()
	// download reports failure by panicking; inside a goroutine that would
	// bring down the whole process, so turn it into a log line here.
	defer func() {
		if r := recover(); r != nil {
			fmt.Printf("save txt %s - %s failed: %v\n", title, url, r)
		}
	}()

	str := download(url)
	fmt.Printf("save txt %s - %s\n", title, url)

	// The title comes from a web page: replace characters that are path
	// separators or not allowed in Windows file names.
	name := strings.Map(func(r rune) rune {
		if r < ' ' || strings.ContainsRune(`\/:*?"<>|`, r) {
			return '_'
		}
		return r
	}, title)
	filePath := "d:/crawl/" + name + ".txt"

	// O_TRUNC so that a shorter text does not leave the tail of an older file.
	file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
	if err != nil {
		fmt.Printf("open file err=%v\n", err)
		return
	}

	// Write through a buffered writer; it has to be flushed explicitly or the
	// buffered data never reaches the file.
	writer := bufio.NewWriter(file)
	_, err = writer.WriteString(str)
	if err == nil {
		err = writer.Flush()
	}
	// Close right away and keep the first error: a failed close can mean the
	// data was not written.
	if cerr := file.Close(); err == nil {
		err = cerr
	}
	if err != nil {
		fmt.Printf("write file err=%v\n", err)
	}
}

// downloadClient is shared by all download calls so connections can be reused.
// The timeout bounds the whole request, including reading the body, so a
// stalled server cannot block a download forever.
var downloadClient = &http.Client{Timeout: 60 * time.Second}

// download fetches url with an HTTP GET, sending a fixed browser-like
// User-Agent header, and returns the response body as a string.
//
// It panics when the request cannot be built or sent, when the server answers
// with a non-2xx status, or when the body cannot be read.
func download(url string) string {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		// Previously ignored, which ended in a nil pointer dereference below.
		fmt.Print("http get err", err)
		panic("http get err")
	}

	req.Header.Set("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)")
	resp, err := downloadClient.Do(req)
	if err != nil {
		fmt.Print("http get err", err)
		panic("http get err")
	}

	defer resp.Body.Close()

	// An error page is not the requested text; treat it like any other failure.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		fmt.Print("http get err ", resp.Status)
		panic("http get err")
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Print("read error ", err)
		panic("read error")
	}
	return string(body)
}


posted @ 2020-11-19 15:05  brady-wang  阅读(421)  评论(1)  编辑  收藏  举报