Go 爬虫

package main

import (
	"fmt"
	"github.com/antchfx/htmlquery"
	"io/ioutil"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
	"time"
)
// wg is the WaitGroup that main blocks on (wg.Wait) before exiting and that
// saveVideo signals (wg.Done) when it has finished a download.
var wg sync.WaitGroup
func main() {

	var url string = "https://haomooc.com/xiaoxue-read-2991.html"

	resp, _ := http.Get(url)

	defer resp.Body.Close()

	doc, _ := htmlquery.Parse(resp.Body)
	list := htmlquery.Find(doc, "//div[@class='dxs-l-b']//a")
	for _, li := range list {
		href := htmlquery.SelectAttr(li, "href")
		strings.Replace(href, " ", "", -1)
		title := htmlquery.SelectAttr(li,"title")
		strings.Replace(title, " ", "", -1)
		fmt.Printf("%s\n", title)
		fmt.Printf("%s\n", href)
		video := getVideo(href)
		wg.Add(1)
		//saveVideo(title,video)
		fmt.Printf("%s\n", video)
	}

	wg.Wait()
}

// videoURLRe matches an absolute .mp4 URL on video.haomooc.com. The dots are
// escaped so they match only literal dots, and the path is matched lazily so
// that several URLs on one line are found as separate matches instead of one
// span running from the first URL to the last ".mp4".
var videoURLRe = regexp.MustCompile(`https://video\.haomooc\.com/.*?\.mp4`)

// getVideo downloads the page at url and returns the last video.haomooc.com
// .mp4 URL found in it, with any spaces removed.
//
// It returns "" when the page cannot be fetched or read (the error is written
// to standard error) or when the page contains no matching URL.
func getVideo(url string) string {
	// One-second pause before every request; presumably there to avoid
	// hammering the site.
	time.Sleep(time.Second * 1)

	resp, err := http.Get(url)
	if err != nil {
		fmt.Fprintf(os.Stderr, "getVideo: fetching %q: %v\n", url, err)
		return ""
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Fprintf(os.Stderr, "getVideo: reading %q: %v\n", url, err)
		return ""
	}

	return extractVideoURL(string(body))
}

// extractVideoURL returns the last match of videoURLRe in page with spaces
// removed, or "" when there is no match.
func extractVideoURL(page string) string {
	matches := videoURLRe.FindAllString(page, -1)
	if len(matches) == 0 {
		return ""
	}
	return strings.Replace(matches[len(matches)-1], " ", "", -1)
}
// PathExists reports whether path can be stat'ed.
//
// It returns (true, nil) when os.Stat succeeds, (false, nil) when os.Stat
// fails with a "does not exist" error, and (false, err) for any other os.Stat
// error.
func PathExists(path string) (bool, error) {
	switch _, statErr := os.Stat(path); {
	case statErr == nil:
		return true, nil
	case os.IsNotExist(statErr):
		return false, nil
	default:
		return false, statErr
	}
}

func saveVideo(title string ,url string)  {

	fmt.Printf(title,url)
	path := "/www/shell/video/"+title+".mp4"
	b, err := PathExists(path)
	if err != nil {
		fmt.Printf("PathExists(%s),err(%v)\n", path, err)
	}
	if b {
		fmt.Printf("path %s 存在\n", path)
	} else{
		fmt.Println("save video "+title )
		fmt.Printf("%s",url)
		// Get the data
		resp, err := http.Get(url)
		if err != nil {
			panic(err)
		}
		defer resp.Body.Close()

		data, err := ioutil.ReadAll(resp.Body)
		if err != nil {
			panic(err)
		}
		ioutil.WriteFile(path, data, 0644)
		defer wg.Done()
	}


}
posted @ 2020-11-28 23:53  brady-wang  阅读(84)  评论(0编辑  收藏  举报