// GetPageInfo:获取数据、存入本地、从本地读取数据
import lombok.SneakyThrows;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.*;
/**
 * Crawls a paginated list, extracts the text of every {@code .alink} element and stores
 * the results in a single local text file.
 *
 * <p>The text of each page is followed by one space, so the stored file can later be
 * tokenised with {@code split("\\s+")}.
 */
public class GetPageInfo {
    /** Directory that receives the scraped data. */
    private static final String OUTPUT_DIR = "F:\\nistCasData";
    /** Name of the data file inside {@link #OUTPUT_DIR}. */
    private static final String OUTPUT_FILE = "cas.txt";
    /** List URL without the trailing page number. */
    private static final String LIST_URL_PREFIX = "https://www.xxx.com.cn/list/?p=";
    /** Number of pages buffered in memory between two writes to disk. */
    private static final int PAGES_PER_FLUSH = 10;

    /**
     * Crawls the configured number of pages and saves the extracted text.
     *
     * <p>The first write of a run truncates the output file; every later write appends to
     * it. Data left over from a previous run is therefore never mixed into the result,
     * regardless of how many pages are crawled.
     *
     * @param args unused
     * @throws Exception if a page cannot be fetched or the output file cannot be written
     */
    public static void main(String[] args) throws Exception {
        int pages = 10; // number of pages to crawl
        StringBuilder pending = new StringBuilder(); // scraped text not yet on disk
        boolean fileStarted = false; // true once this run has written the output file

        for (int i = 1; i <= pages; i++) {
            String cas = getCas(LIST_URL_PREFIX + i);
            System.out.println("第" + i + "页数据获取成功");
            pending.append(cas).append(' ');

            // Write in batches so that a long crawl does not keep everything in memory.
            if (i % PAGES_PER_FLUSH == 0) {
                writeToFile(pending.toString(), OUTPUT_DIR, OUTPUT_FILE, fileStarted);
                fileStarted = true;
                pending.setLength(0);
                System.out.println("第" + i / PAGES_PER_FLUSH + "次保存成功");
            }
        }

        // Store the pages of the last, incomplete batch. When nothing has been written
        // yet this also (re)creates the file, so it never keeps content of an older run.
        if (!fileStarted || pending.length() > 0) {
            writeToFile(pending.toString(), OUTPUT_DIR, OUTPUT_FILE, fileStarted);
        }
    }

    /**
     * Downloads one page and returns the combined text of all elements with class
     * {@code alink}.
     *
     * <p>SECURITY: before the request, certificate and host-name verification are switched
     * off for every HTTPS connection of this JVM. Only use this against hosts you trust.
     *
     * @param url address of the page to fetch
     * @return the text of all {@code .alink} elements as produced by jsoup
     */
    @SneakyThrows
    public static String getCas(String url){
        // Install the lenient TLS defaults directly. Calling
        // HttpsUrlValidator.retrieveResponseFromServer(url) for this side effect would
        // download every page a second time only to throw the response away.
        HttpsUrlValidator.trustAllHttpsCertificates();
        javax.net.ssl.HttpsURLConnection.setDefaultHostnameVerifier(HttpsUrlValidator.hv);

        // Request headers copied from a desktop browser session.
        Connection.Response response = Jsoup
                .connect(url)
                .header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
                .header("Accept-Encoding","*/*")
                .header("Accept-Language","zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,ja-JP;q=0.6,ja;q=0.5,ko-KR;q=0.4,ko;q=0.3")
                .header("Connection","keep-alive")
                .header("Content-Type","application/json;charset=UTF-8")
                .header("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36")
                .timeout(10000) // connect/read timeout in milliseconds
                .ignoreContentType(true)
                .execute();
        Document doc = Jsoup.parse(response.body());
        return doc.select(".alink").text();
    }

    /**
     * Writes {@code result} to a file, replacing any previous content.
     *
     * @param result      text to store; must not be {@code null}
     * @param outPath     target directory, created when missing
     * @param outFileName name of the file inside {@code outPath}
     * @throws Exception if the directory cannot be created or the file cannot be written
     */
    public static void writeOcrStrtoFile(String result, String outPath, String outFileName) throws Exception {
        writeToFile(result, outPath, outFileName, false);
    }

    /**
     * Reads a text file and returns its lines joined without any separator.
     *
     * <p>This is a best-effort read: when the file is missing or unreadable the problem is
     * reported on standard error and whatever was read so far (possibly the empty string)
     * is returned.
     *
     * @param fileName path of the file to read
     * @return the file content with all line terminators removed
     */
    public static String readFileByLines(String fileName) {
        StringBuilder readData = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(fileName), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                readData.append(line);
            }
        } catch (IOException e) {
            System.err.println("Could not read " + fileName + ": " + e);
        }
        return readData.toString();
    }

    /** Writes {@code result} as UTF-8, either appending to the file or replacing it. */
    private static void writeToFile(String result, String outPath, String outFileName, boolean append)
            throws IOException {
        File dir = new File(outPath);
        // The second isDirectory() check covers a directory created concurrently.
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Could not create directory " + dir);
        }
        // FileOutputStream creates the file when needed and truncates it unless append is
        // set, so no explicit delete/create is required.
        try (FileOutputStream fos = new FileOutputStream(new File(dir, outFileName), append)) {
            fos.write(result.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        }
    }
}
// HttpsUrlValidator:忽略https证书(http应该不需要,没试过)
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLSession;
/**
 * Helper that switches off HTTPS certificate and host-name verification for the whole JVM
 * and can fetch a URL with those relaxed settings.
 *
 * <p>SECURITY: once {@link #trustAllHttpsCertificates()} has run, every
 * {@link HttpsURLConnection} created afterwards accepts any certificate, which makes the
 * connections vulnerable to man-in-the-middle attacks. Use only against trusted hosts.
 */
public class HttpsUrlValidator {
    /** Host-name verifier that logs the requested and the peer host and accepts both. */
    static HostnameVerifier hv = new HostnameVerifier() {
        @Override
        public boolean verify(String urlHostName, SSLSession session) {
            System.out.println("Warning: URL Host: " + urlHostName + " vs. "
                    + session.getPeerHost());
            return true;
        }
    };

    /**
     * Fetches {@code url} with certificate and host-name checks disabled.
     *
     * @param url address to request; must use the http or https scheme
     * @return the response body with every line terminated by {@code '\n'}, or
     *         {@code null} when the URL is invalid or the request fails
     */
    public final static String retrieveResponseFromServer(final String url) {
        HttpURLConnection connection = null;
        try {
            URL validationUrl = new URL(url);
            trustAllHttpsCertificates();
            HttpsURLConnection.setDefaultHostnameVerifier(hv);
            connection = (HttpURLConnection) validationUrl.openConnection();

            final StringBuilder body = new StringBuilder(255);
            // Decode with a fixed charset instead of the platform default.
            // NOTE(review): assumes a UTF-8 response; the Content-Type charset is not consulted.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = in.readLine()) != null) {
                    body.append(line).append('\n');
                }
            }
            return body.toString();
        } catch (final Exception e) {
            // Callers treat null as "no response"; report the full exception, because
            // getMessage() alone may be null and hides the exception type.
            System.out.println("Request to " + url + " failed: " + e);
            return null;
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Installs a default SSL socket factory that accepts every certificate chain.
     *
     * <p>The change is JVM-wide and stays in effect until another default factory is set.
     *
     * @throws Exception if the SSL context cannot be created or initialised
     */
    public static void trustAllHttpsCertificates() throws Exception {
        javax.net.ssl.TrustManager[] trustAllCerts = { new miTM() };
        javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext.getInstance("SSL");
        sc.init(null, trustAllCerts, null);
        javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    }

    /** Trust manager that performs no validation at all. */
    static class miTM implements javax.net.ssl.TrustManager,
            javax.net.ssl.X509TrustManager {
        /** Shared empty result; a zero-length array cannot be modified by callers. */
        private static final java.security.cert.X509Certificate[] NO_ISSUERS =
                new java.security.cert.X509Certificate[0];

        /**
         * Returns an empty array: the X509TrustManager contract requires a non-null
         * result, and no issuer is preferred because everything is accepted.
         */
        @Override
        public java.security.cert.X509Certificate[] getAcceptedIssuers() {
            return NO_ISSUERS;
        }

        /** Always reports the server chain as trusted. */
        public boolean isServerTrusted(
                java.security.cert.X509Certificate[] certs) {
            return true;
        }

        /** Always reports the client chain as trusted. */
        public boolean isClientTrusted(
                java.security.cert.X509Certificate[] certs) {
            return true;
        }

        /** Accepts every server certificate chain by never throwing. */
        @Override
        public void checkServerTrusted(
                java.security.cert.X509Certificate[] certs, String authType)
                throws java.security.cert.CertificateException {
            // intentionally empty
        }

        /** Accepts every client certificate chain by never throwing. */
        @Override
        public void checkClientTrusted(
                java.security.cert.X509Certificate[] certs, String authType)
                throws java.security.cert.CertificateException {
            // intentionally empty
        }
    }
}