Go Regex Crawlers
1. Crawling phone numbers with a regex
Result: (output screenshot)
Code:
package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "os"
    "regexp"
)

var (
    rePhone = `(1[3456789]\d)(\d{4})(\d{4})`
)

func HandleError(err error, when string) {
    if err != nil {
        fmt.Println(when, err)
        os.Exit(1)
    }
}

func main() {
    // fetch the page with an HTTP GET request
    resp, err := http.Get("https://www.haomagujia.com/")

    // handle any request error
    HandleError(err, "http.Get")
    defer resp.Body.Close()

    // read the whole response body
    bytes, _ := ioutil.ReadAll(resp.Body)
    html := string(bytes)
    //fmt.Println(html)

    // use the compiled regexp to filter phone numbers out of the page
    re := regexp.MustCompile(rePhone)
    // -1 means return all matches
    allString := re.FindAllStringSubmatch(html, -1)
    for _, x := range allString {
        fmt.Println(x)
    }
}
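FindAllStringSubmatch returns, for every hit, the full match followed by each capture group, which is why the loop above prints a slice rather than a single string. Below is a minimal offline sketch of the same pattern; the sample text and numbers are made up for illustration, and it shows one way the three groups could be used, e.g. to mask the middle four digits:

package main

import (
    "fmt"
    "regexp"
)

func main() {
    // same pattern as above: prefix / middle four / last four as separate groups
    re := regexp.MustCompile(`(1[3456789]\d)(\d{4})(\d{4})`)

    // made-up sample text standing in for the downloaded page
    text := "contact: 13812345678, backup: 17698761234"

    for _, m := range re.FindAllStringSubmatch(text, -1) {
        // m[0] is the full number, m[1]..m[3] are the three capture groups
        fmt.Printf("full=%s masked=%s****%s\n", m[0], m[1], m[3])
    }
}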
2. Crawling email addresses with a regex
Result: (output screenshot)
Code:
package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "os"
    "regexp"
)

var (
    reEmail = `[\w\.]+@\w+\.[a-z]{2,3}(\.[a-z]{2,3})?`
)

func HandleError(err error, when string) {
    if err != nil {
        fmt.Println(when, err)
        os.Exit(1)
    }
}

func main() {
    html := GetHtml("https://www.douban.com/group/topic/41562980/")

    // append some mock email data
    html += "xiaoming@163.com\n"
    html += "aaa@126.com\n"
    html += "22223@qq.com\n"
    html += "x.badt@gmail.com\n"

    re := regexp.MustCompile(reEmail)
    allString := re.FindAllStringSubmatch(html, -1)
    for _, x := range allString {
        fmt.Println(x)
    }
}

func GetHtml(url string) string {
    resp, err := http.Get(url)
    HandleError(err, "http.Get")
    defer resp.Body.Close()
    bytes, _ := ioutil.ReadAll(resp.Body)
    html := string(bytes)
    return html
}
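The trailing `(\.[a-z]{2,3})?` group makes a second-level suffix such as ".co.uk" optional. A small standalone sketch with made-up addresses shows how that optional group comes back empty for plain ".com" style addresses:

package main

import (
    "fmt"
    "regexp"
)

func main() {
    re := regexp.MustCompile(`[\w\.]+@\w+\.[a-z]{2,3}(\.[a-z]{2,3})?`)

    // made-up addresses: the second one exercises the optional two-level suffix group
    samples := "aaa@126.com someone@example.co.uk"

    for _, m := range re.FindAllStringSubmatch(samples, -1) {
        // m[1] is empty when the address has only a single-level suffix
        fmt.Printf("match=%q optional suffix group=%q\n", m[0], m[1])
    }
}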
3. Crawling hyperlinks with a regex
Code:
package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "os"
    "regexp"
)

var (
    // adjust the pattern to match the actual page markup
    reLink = `<a[\s\S]+?href="(http[\s\S]+?)"`
)

func HandleError(err error, when string) {
    if err != nil {
        fmt.Println(when, err)
        os.Exit(1)
    }
}

func main() {
    html := GetHtml("https://www.hao123.com")

    // extract the hyperlinks
    re := regexp.MustCompile(reLink)
    allString := re.FindAllStringSubmatch(html, -1)
    for _, x := range allString {
        fmt.Println(x[0])
    }
}

func GetHtml(url string) string {
    resp, err := http.Get(url)
    HandleError(err, "http.Get")
    defer resp.Body.Close()
    bytes, _ := ioutil.ReadAll(resp.Body)
    html := string(bytes)
    return html
}
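Here x[0] prints the whole matched `<a ... href="..."` fragment; because the URL sits in a capture group, printing x[1] instead yields just the address. A minimal offline sketch against a made-up HTML fragment (the URLs are placeholders):

package main

import (
    "fmt"
    "regexp"
)

func main() {
    re := regexp.MustCompile(`<a[\s\S]+?href="(http[\s\S]+?)"`)

    // made-up HTML snippet standing in for the downloaded page
    html := `<a class="site" href="http://www.example.com/a">A</a>
<a href="https://www.example.com/b">B</a>`

    for _, m := range re.FindAllStringSubmatch(html, -1) {
        // m[0] is the whole <a ... href="..." fragment, m[1] is just the URL
        fmt.Println(m[1])
    }
}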
4. Crawling ID card numbers with a regex
Code:
package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "os"
    "regexp"
)

var (
    // ID number layout: 6-digit region code + 8-digit birth date (year, month, day) + 3-digit sequence + 1 check character (digit or x)
    // adjust the pattern to match the actual page; note the month alternatives must be wrapped in one group,
    // otherwise the top-level | splits the whole pattern into two unrelated alternatives
    reIdCard = `[1-6]\d{5}((19\d{2})|(20((0\d)|(1[0-8]))))((0[1-9])|(1[012]))((0[1-9])|([12]\d)|(3[01]))\d{3}[\dXx]`
)

func HandleError(err error, when string) {
    if err != nil {
        fmt.Println(when, err)
        os.Exit(1)
    }
}

func main() {
    html := GetHtml("http://www.shaoxing.com.cn/p/2771751.html")

    re := regexp.MustCompile(reIdCard)
    allString := re.FindAllStringSubmatch(html, -1)
    for _, x := range allString {
        fmt.Println(x[0])
    }
}

func GetHtml(url string) string {
    resp, err := http.Get(url)
    HandleError(err, "http.Get")
    defer resp.Body.Close()
    bytes, _ := ioutil.ReadAll(resp.Body)
    html := string(bytes)
    return html
}
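To sanity-check the corrected pattern without hitting the page, here is a small sketch run against fabricated strings (these are not real ID numbers); the third string has an impossible month and should not match:

package main

import (
    "fmt"
    "regexp"
)

func main() {
    // same corrected pattern as above, with the month alternatives grouped
    re := regexp.MustCompile(`[1-6]\d{5}((19\d{2})|(20((0\d)|(1[0-8]))))((0[1-9])|(1[012]))((0[1-9])|([12]\d)|(3[01]))\d{3}[\dXx]`)

    // fabricated sample strings for illustration only
    text := "id1: 11010119900307432x id2: 340102200512150316 bad: 110101199013074321"

    for _, m := range re.FindAllStringSubmatch(text, -1) {
        fmt.Println(m[0])
    }
}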
Done.