Java crawlers: crawling URLs, videos, and images
1. Crawling URLs
import java.io.*;
import java.net.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A simple crawler in Java: extract the links found on one page.
 */
public class Robot {
    public static void main(String[] args) {
        URL url = null;
        URLConnection urlconn = null;
        BufferedReader br = null;
        PrintWriter pw = null;
        // String regex = "http://[\\w+\\.?/?]+\\.[A-Za-z]+";
        String regex = "https://[\\w+\\.?/?]+\\.[A-Za-z]+"; // URL matching rule
        Pattern p = Pattern.compile(regex);
        try {
            // The page to crawl
            url = new URL("https://www.cnblogs.com/peachh/p/9740229.html");
            urlconn = url.openConnection();
            // Write the extracted links to C:/SiteURL.txt
            pw = new PrintWriter(new FileWriter("C:/SiteURL.txt"), true);
            br = new BufferedReader(new InputStreamReader(urlconn.getInputStream()));
            String buf = null;
            while ((buf = br.readLine()) != null) {
                Matcher buf_m = p.matcher(buf);
                while (buf_m.find()) {
                    pw.println(buf_m.group());
                }
            }
            System.out.println("Crawl finished ^_^");
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (pw != null) {
                pw.close();
            }
        }
    }
}
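The manual cleanup in the finally block can be avoided entirely with try-with-resources. Below is a minimal sketch of the same link extraction written that way, assuming the same target page, regex, and output path as above:

import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RobotTryWithResources {
    public static void main(String[] args) throws Exception {
        // Same matching rule as above; switch to http:// if needed
        Pattern p = Pattern.compile("https://[\\w+\\.?/?]+\\.[A-Za-z]+");
        URL url = new URL("https://www.cnblogs.com/peachh/p/9740229.html");
        // Resources declared here are closed automatically, even when an exception is thrown
        try (BufferedReader br = new BufferedReader(
                     new InputStreamReader(url.openConnection().getInputStream()));
             PrintWriter pw = new PrintWriter(new FileWriter("C:/SiteURL.txt"), true)) {
            String line;
            while ((line = br.readLine()) != null) {
                Matcher m = p.matcher(line);
                while (m.find()) {
                    pw.println(m.group());
                }
            }
        }
        System.out.println("Crawl finished ^_^");
    }
}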
2. Crawling videos
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawl short videos from a listing page.
 * @author cxd
 */
public class WebSpiderDemo1 {

    public static void main(String[] args) throws Exception {
        String source = "http://www.budejie.com/video/";
        String destDir = "C:/rob/";
        Map<String, String> urlMap = getUrlInSource(source);
        for (Map.Entry<String, String> entry : urlMap.entrySet()) {
            String title = entry.getKey();  // video title
            String url = entry.getValue();  // video URL
            File destFile = new File(destDir + title + ".mp4");
            download(url, destFile);
        }
    }

    /**
     * Download a video by URL and save it locally.
     *
     * @param url      the video URL
     * @param destFile the local file to write to
     * @throws IOException
     */
    public static void download(String url, File destFile) throws IOException {
        URL videoUrl = new URL(url);
        InputStream is = videoUrl.openStream();
        FileOutputStream fos = new FileOutputStream(destFile);
        int len = 0;
        byte[] buffer = new byte[1024];
        while ((-1) != (len = is.read(buffer))) {
            fos.write(buffer, 0, len);
        }
        fos.flush();
        if (null != fos) {
            fos.close();
        }
        if (null != is) {
            is.close();
        }
    }

    /**
     * Collect video titles and URLs from the listing pages into a HashMap.
     *
     * @param source the listing page base URL
     * @return map of video title to video URL
     * @throws IOException
     */
    public static Map<String, String> getUrlInSource(String source) throws IOException {
        Map<String, String> hashMap = new HashMap<>();
        // The site has up to 50 pages; only the first page is crawled here.
        for (int index = 1; index <= 1; index++) {
            String pageUrl = source + index;
            URL url = new URL(pageUrl);
            InputStream is = url.openStream();
            // If you run into anti-crawling measures, disguise the program as a browser instead:
            // HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            // conn.setRequestMethod("GET");
            // conn.setRequestProperty("user-agent",
            //         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36");
            // BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            String info = null;
            String title = null;
            // Do not loop on readLine() == null here: many lines of the page come back null,
            // and dereferencing them would throw java.lang.NullPointerException.
            for (int i = 0; i < 10000; i++) {
                info = br.readLine();
                if (null != info) {
                    String urlRegex = "data-mp4=\"(.*?\\.mp4)";
                    if (info.contains("data-title")) {
                        title = info;
                    }
                    Pattern pattern = Pattern.compile(urlRegex);
                    Matcher matcher = pattern.matcher(info);
                    if (matcher.find()) {
                        for (int j = 0; j <= matcher.groupCount(); j++) {
                            String tmp = matcher.group(j);
                            if (!tmp.startsWith("data-mp4=")) {
                                String videoTitle = getTitle(title.trim());
                                hashMap.put(videoTitle, tmp);
                            }
                        }
                    }
                }
            }
        }
        return hashMap;
    }

    /**
     * Clean up the title string extracted from the data-title line.
     *
     * @param info the raw line containing data-title
     * @return the cleaned title
     */
    private static String getTitle(String info) {
        int len = info.length();
        // Drop the leading data-title=" (12 characters) and the trailing quote
        String title = info.substring(12, len - 1);
        return title;
    }
}
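To make the capture groups of the data-mp4 regex concrete, here is a small self-contained sketch run against an invented HTML line; the markup and the video URL are made-up examples, not taken from the real site:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexDemo {
    public static void main(String[] args) {
        // A made-up line in the shape the listing page is assumed to use
        String info = "<div class=\"j-video\" data-mp4=\"http://example.com/videos/12345.mp4\" data-title=\"funny clip\">";
        Pattern pattern = Pattern.compile("data-mp4=\"(.*?\\.mp4)");
        Matcher matcher = pattern.matcher(info);
        if (matcher.find()) {
            // group(0) is the whole match, including the data-mp4=" prefix,
            // so getUrlInSource skips it; group(1) is the captured URL that ends up in the map
            System.out.println(matcher.group(0)); // data-mp4="http://example.com/videos/12345.mp4
            System.out.println(matcher.group(1)); // http://example.com/videos/12345.mp4
        }
    }
}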
3. Crawling images
import com.obcy.util.DownLoad;
import com.obcy.util.GetHTML;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.Test;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

public class BiAn {

    // Collect all the listing pages, from page 2 to page 946.
    public ArrayList<String> getTopUrl() {
        // String topurl = "http://www.netbian.com/hd3840x2160/index_2.htm"

        // A list holding every listing page URL
        ArrayList<String> list = new ArrayList<String>();
        for (int i = 2; i <= 946; i++) {
            list.add("http://www.netbian.com/hd3840x2160/index_" + i + ".htm");
        }
        return list;
    }

    // Collect every picture-view page linked from one listing page.
    // The parameter is the listing page URL.
    public HashMap<String, String> getGpjView(String topUrl) {
        String url = topUrl;
        String html = GetHTML.getHTML(url);
        // Parse the page source into a Document object
        Document document = Jsoup.parse(html);
        // The single element with class "list" contains all picture <a> tags on the page
        Elements list = document.getElementsByClass("list");
        HashMap<String, String> map = new HashMap<String, String>();
        Elements a = null;
        try {
            a = list.get(0).select("ul>li>a");
        } catch (Exception e) {
            System.out.println("No <a> tags found");
            return map; // nothing to collect on this page
        }
        // Walk the <a> tags, build the full picture-view URL from each href, and store it in the map
        for (int i = 0; i < a.size(); i++) {
            String href = "http://www.netbian.com" + a.get(i).attr("href");
            String name = a.get(i).attr("title");
            // e.g. http://www.netbian.com/desk/22138.htm
            map.put(name, href);
        }
        // All picture-view URLs on this page have been collected
        return map;
    }

    // Visit every picture-view page and download the single picture it contains.
    // The parameter is the map of picture-view URLs collected from one listing page.
    public void getDownload(HashMap<String, String> map) {
        // For each view page: extract the picture download URL, then download it
        for (Map.Entry<String, String> entry : map.entrySet()) {
            String html = GetHTML.getHTML(entry.getValue());
            Document document = Jsoup.parse(html);
            // Get the <img> tag for the picture; there is only one
            Elements endpage = null;
            try {
                endpage = document.getElementsByClass("endpage").get(0).select("div>p>a>img");
            } catch (Exception e) {
                System.out.println("Page element not found, moving on to the next one");
                continue;
            }
            // The download URL
            String target = endpage.get(0).attr("src");
            String path = "F:/BiAn/" + entry.getKey() + ".jpg";
            // Download the picture
            DownLoad.downLoad(target, path);
        }
    }

    @Test
    public void test() {
        // Make sure the download folder F:/BiAn exists
        File file = new File("F:/BiAn");
        if (!file.exists()) {
            file.mkdirs();
            System.out.println("Created download folder F:/BiAn");
        } else {
            System.out.println("Folder already exists, ready to download");
        }

        // Single-threaded:
        // 1. get all listing pages; 2. for each page, collect the view URLs; 3. download
        ArrayList<String> topUrl = getTopUrl();
        for (String url : topUrl) {
            HashMap<String, String> gpjView = getGpjView(url);
            getDownload(gpjView);
        }
    }
}
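The class above depends on two helper classes, com.obcy.util.GetHTML and com.obcy.util.DownLoad, whose source is not shown in this post. Below is a minimal sketch of what they might look like using only the JDK. The signatures getHTML(String) and downLoad(String, String) are inferred from the calls above; details such as the User-Agent header and the page encoding are assumptions, not the original implementation.

// GetHTML.java
package com.obcy.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class GetHTML {
    // Fetch a page and return its source as one string.
    // The charset is an assumption; adjust it if the site uses a different encoding (e.g. GBK).
    public static String getHTML(String url) {
        StringBuilder sb = new StringBuilder();
        try {
            HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
            conn.setRequestProperty("User-Agent", "Mozilla/5.0");
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = br.readLine()) != null) {
                    sb.append(line).append('\n');
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }
}

// DownLoad.java
package com.obcy.util;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

public class DownLoad {
    // Stream the resource at target into the local file at path
    public static void downLoad(String target, String path) {
        try {
            HttpURLConnection conn = (HttpURLConnection) new URL(target).openConnection();
            conn.setRequestProperty("User-Agent", "Mozilla/5.0");
            try (InputStream is = conn.getInputStream();
                 FileOutputStream fos = new FileOutputStream(path)) {
                byte[] buffer = new byte[4096];
                int len;
                while ((len = is.read(buffer)) != -1) {
                    fos.write(buffer, 0, len);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}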
4. How do you analyze a web page's structure in order to crawl it?
https://www.cnblogs.com/518894-lu/p/9021548.html