Java 爬取携程酒店数据并存入本地数据库
话不多说,直接上代码
1:配置依赖
<!-- Maven dependencies for the crawler; place these entries inside the <dependencies> element of pom.xml. -->
<!-- NOTE(review): the Java code shown in this article only references MyBatis and the JDK;
     confirm whether jsoup, org.json and fastjson are actually needed before keeping them. -->
<!-- NOTE(review): all versions below are old releases; check each for published security
     advisories (fastjson in particular) before using them in a real project. -->

<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
</dependency>
<dependency>
    <groupId>org.json</groupId>
    <artifactId>json</artifactId>
    <version>20160810</version>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.47</version>
</dependency>
<!-- JDBC driver; provides the com.mysql.jdbc.Driver class named in jdbc.properties. -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.47</version>
</dependency>
<!-- MyBatis: SqlSessionFactory / SqlSession / mapper support used by the crawler. -->
<dependency>
    <groupId>org.mybatis</groupId>
    <artifactId>mybatis</artifactId>
    <version>3.5.1</version>
</dependency>
2:数据库表bean对象
/**
 * Mutable bean holding one hotel row.
 *
 * <p>Every attribute is kept as a {@code String}; the class performs no
 * validation or conversion of the values it is given.
 */
public class Hotel {

    /** Hotel id. */
    private String hid;

    /** Hotel price. */
    private String price;

    /** City the hotel is located in. */
    private String city;

    /** Hotel name. */
    private String name;

    /** Hotel picture. */
    private String pic;

    /** First map coordinate of the hotel (latitude/longitude pair together with {@link #ypath}). */
    private String xpath;

    /** Second map coordinate of the hotel (latitude/longitude pair together with {@link #xpath}). */
    private String ypath;

    /** Hotel address. */
    private String addr;

    /** Star rating of the hotel. */
    private String start;

    /** Review score of the hotel. */
    private String score;

    /** @return the hotel id */
    public String getHid() {
        return this.hid;
    }

    /** @param hid the hotel id */
    public void setHid(String hid) {
        this.hid = hid;
    }

    /** @return the hotel price */
    public String getPrice() {
        return this.price;
    }

    /** @param price the hotel price */
    public void setPrice(String price) {
        this.price = price;
    }

    /** @return the city the hotel is located in */
    public String getCity() {
        return this.city;
    }

    /** @param city the city the hotel is located in */
    public void setCity(String city) {
        this.city = city;
    }

    /** @return the hotel name */
    public String getName() {
        return this.name;
    }

    /** @param name the hotel name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the hotel picture */
    public String getPic() {
        return this.pic;
    }

    /** @param pic the hotel picture */
    public void setPic(String pic) {
        this.pic = pic;
    }

    /** @return the first map coordinate */
    public String getXpath() {
        return this.xpath;
    }

    /** @param xpath the first map coordinate */
    public void setXpath(String xpath) {
        this.xpath = xpath;
    }

    /** @return the second map coordinate */
    public String getYpath() {
        return this.ypath;
    }

    /** @param ypath the second map coordinate */
    public void setYpath(String ypath) {
        this.ypath = ypath;
    }

    /** @return the hotel address */
    public String getAddr() {
        return this.addr;
    }

    /** @param addr the hotel address */
    public void setAddr(String addr) {
        this.addr = addr;
    }

    /** @return the star rating */
    public String getStart() {
        return this.start;
    }

    /** @param start the star rating */
    public void setStart(String start) {
        this.start = start;
    }

    /** @return the review score */
    public String getScore() {
        return this.score;
    }

    /** @param score the review score */
    public void setScore(String score) {
        this.score = score;
    }

    /**
     * Renders all fields as {@code Hotel{field='value', ...}}; unset fields
     * appear as {@code 'null'}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Hotel{");
        text.append("hid='").append(this.hid).append('\'');
        text.append(", price='").append(this.price).append('\'');
        text.append(", city='").append(this.city).append('\'');
        text.append(", name='").append(this.name).append('\'');
        text.append(", pic='").append(this.pic).append('\'');
        text.append(", xpath='").append(this.xpath).append('\'');
        text.append(", ypath='").append(this.ypath).append('\'');
        text.append(", addr='").append(this.addr).append('\'');
        text.append(", start='").append(this.start).append('\'');
        text.append(", score='").append(this.score).append('\'');
        return text.append('}').toString();
    }
}
3:数据库配置
# JDBC settings; mybatis-config.xml reads them through ${driver}, ${url}, ${username}, ${password}.
driver=com.mysql.jdbc.Driver
# The path segment after host:port is the name of the database (schema), not of a table.
# NOTE(review): the crawler stores Chinese text; if it arrives garbled, consider appending
# ?useUnicode=true&characterEncoding=UTF-8 to the URL - confirm against the server settings.
url=jdbc:mysql://localhost:3306/数据库名
username=***
password=***
4:mybatis配置
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
        PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-config.dtd">
<!-- MyBatis configuration: one JDBC-managed, pooled environment plus the hotel mapper. -->
<configuration>
    <!-- Supplies the ${driver}, ${url}, ${username} and ${password} values used below. -->
    <properties resource="jdbc.properties"/>

    <environments default="development">
        <environment id="development">
            <transactionManager type="JDBC"/>
            <dataSource type="POOLED">
                <property name="driver" value="${driver}"/>
                <property name="url" value="${url}"/>
                <property name="username" value="${username}"/>
                <property name="password" value="${password}"/>
            </dataSource>
        </environment>
    </environments>

    <!-- SQL mapping files, resolved as classpath resources. -->
    <mappers>
        <mapper resource="HotelMapper.xml"/>
    </mappers>
</configuration>
5:dao接口,为插入数据做准备
/**
 * Data-access contract for persisting {@link Hotel} records.
 */
public interface HotelMapper {

    /**
     * Stores one hotel record.
     *
     * @param hotel the record whose fields are written
     * @return the result reported for the insert statement
     *         (presumably the number of affected rows; confirm against the mapper framework)
     */
    int insert(Hotel hotel);
}
6:编写 DAO 接口的映射文件(HotelMapper.xml)
<!DOCTYPE mapper
PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.yc.dao.HotelMapper">
<!-- The namespace names the mapper interface this file is bound to; the statement id
     below has to match a method name on that interface (insert). -->
<!-- NOTE(review): the namespace and parameterType assume the packages com.yc.dao and
     com.yc.model; the Java snippets in this article show no package declarations - confirm. -->
<!-- Inserts one row into the hotel table from the properties of a Hotel object.
     The id column is given an explicit null; presumably it is an auto-generated key -
     verify against the table definition. -->
<insert id="insert" parameterType="com.yc.model.Hotel">
insert into hotel(id,hid,name,price,city,xpath,ypath,pic,addr,start,score)
values (null,#{hid},#{name},#{price},#{city},#{xpath},#{ypath},#{pic},#{addr},#{start},#{score})
</insert>
</mapper>
7:终极奥义,爬虫代码
/**
 * Crawls Ctrip hotel listing pages and stores every hotel found in the local
 * database through the MyBatis {@code HotelMapper}.
 *
 * <p>The hotel data is taken from two lines of the listing page: one that
 * carries the text {@code htllist} at column 12 and one that carries
 * {@code hotelPositionJSON} at column 8. Values are picked out of those lines
 * purely by counting double quotes, so the extraction depends on the exact
 * layout the page had when this was written.
 */
public class Test02 {

    /** Prefix of every listing URL; the city path is appended to it. */
    private static final String LISTING_BASE_URL = "https://hotels.ctrip.com/hotel/";

    /** Marker text and column of the line holding hotel ids. */
    private static final String HTLLIST_MARKER = "htllist";
    private static final int HTLLIST_OFFSET = 12;

    /** Marker text and column of the line holding names, coordinates, addresses, ... */
    private static final String POSITION_MARKER = "hotelPositionJSON";
    private static final int POSITION_OFFSET = 8;

    /** Network timeouts so that a stalled request cannot block the crawl forever. */
    private static final int CONNECT_TIMEOUT_MS = 15000;
    private static final int READ_TIMEOUT_MS = 30000;

    /** Number of values a complete hotel record carries (see {@link #getHotelInfoByCity}). */
    private static final int FIELDS_PER_HOTEL = 10;

    /**
     * Crawls the Guangzhou listing pages and inserts each hotel into the database.
     *
     * @param args unused
     * @throws IOException          if the MyBatis configuration cannot be read or a page request fails
     * @throws InterruptedException if the pause between two page requests is interrupted
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        String resource = "mybatis-config.xml";
        // A missing or unreadable configuration fails right here with the real cause
        // instead of being printed and then followed by a build(null) call.
        try (InputStream inputStream = Resources.getResourceAsStream(resource)) {
            SqlSessionFactory sqlSessionFactory = new SqlSessionFactoryBuilder().build(inputStream);
            // openSession(true) yields an auto-commit session, so no explicit commit is issued below.
            try (SqlSession sqlSession = sqlSessionFactory.openSession(true)) {
                HotelMapper mapper = sqlSession.getMapper(HotelMapper.class);
                System.out.println(getHotelInfoByCity("guangzhou32"));
                String str = "guangzhou32/p";
                int x = 0;
                // NOTE(review): the step of 2 requests only odd page numbers (1, 3, ..., 99);
                // confirm that skipping the even pages is intended.
                for (int i = 1; i < 100; i += 2) {
                    Thread.sleep(5000);
                    List<List<String>> lists = getHotelInfoByCity(str + i);
                    if (lists == null) {
                        System.err.println("No hotel data found on page " + i + ", skipping it");
                        continue;
                    }
                    for (List<String> list : lists) {
                        if (list.size() < FIELDS_PER_HOTEL) {
                            System.err.println("Skipping incomplete hotel record: " + list);
                            continue;
                        }
                        mapper.insert(toHotel(list));
                        x++;
                    }
                    System.out.println("已爬取" + x + "条数据");
                }
            }
        }
    }

    /**
     * Downloads one listing page and extracts its hotels.
     *
     * <p>Each inner list holds, in this order: hotel id, city display name, the
     * value found at quote slot 7 of the {@code htllist} line (stored by
     * {@link #main} as the price), followed by the seven values taken from the
     * {@code hotelPositionJSON} line, which {@link #main} stores as name,
     * xpath, ypath, picture, address, score and star rating. A list can be
     * shorter when the page data is truncated.
     *
     * @param city path appended to the listing URL, e.g. {@code "guangzhou32"} or
     *             {@code "guangzhou32/p3"} for a specific page
     * @return the extracted records, or {@code null} when either marker line is
     *         missing from the page
     * @throws IOException if the page cannot be fetched or read
     */
    public static List<List<String>> getHotelInfoByCity(String city) throws IOException {
        URL url = new URL(LISTING_BASE_URL + city);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        connection.setReadTimeout(READ_TIMEOUT_MS);

        String roomnum = null;
        String hotelinfo = null;
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "utf-8"))) {
            String current;
            // Only the first occurrence of each marker line is used, so reading can
            // stop as soon as both have been seen.
            while ((roomnum == null || hotelinfo == null) && (current = in.readLine()) != null) {
                // startsWith(prefix, offset) is false for lines that are too short and
                // needs no fixed-size buffer, so arbitrarily long lines are safe.
                if (roomnum == null && current.startsWith(HTLLIST_MARKER, HTLLIST_OFFSET)) {
                    roomnum = current.substring(HTLLIST_OFFSET);
                }
                if (hotelinfo == null && current.startsWith(POSITION_MARKER, POSITION_OFFSET)) {
                    hotelinfo = current.substring(POSITION_OFFSET);
                }
            }
        } finally {
            connection.disconnect();
        }

        if (roomnum == null || hotelinfo == null) {
            return null;
        }
        return parseHotels(roomnum, hotelinfo, cityDisplayName(city));
    }

    /**
     * Compares the first {@code length} characters of a char array with those of a string.
     *
     * @param left   characters to check
     * @param right  expected text
     * @param length number of leading characters to compare; values {@code <= 0} compare nothing
     * @return {@code true} if the first {@code length} characters are equal; {@code false}
     *         if they differ or if either input is {@code null} or shorter than {@code length}
     */
    public static boolean charsequals(char[] left, String right, int length) {
        if (length <= 0) {
            return true;
        }
        if (left == null || right == null || length > left.length || length > right.length()) {
            return false;
        }
        for (int i = 0; i < length; i++) {
            if (left[i] != right.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    /** Copies one extracted record (at least {@link #FIELDS_PER_HOTEL} values) into a new Hotel. */
    private static Hotel toHotel(List<String> list) {
        Hotel hotel = new Hotel();
        hotel.setHid(list.get(0));
        hotel.setCity(list.get(1));
        hotel.setPrice(list.get(2));
        hotel.setName(list.get(3));
        hotel.setXpath(list.get(4));
        hotel.setYpath(list.get(5));
        hotel.setPic(list.get(6));
        hotel.setAddr(list.get(7));
        hotel.setScore(list.get(8));
        // Only the last character of the star value is stored; an empty value becomes "0".
        String star = list.get(9);
        hotel.setStart(star.isEmpty() ? "0" : String.valueOf(star.charAt(star.length() - 1)));
        return hotel;
    }

    /** Maps a city path (with or without a "/p<page>" suffix) to the stored city name. */
    private static String cityDisplayName(String city) {
        int slash = city.indexOf('/');
        String slug = slash >= 0 ? city.substring(0, slash) : city;
        switch (slug) {
            case "chengdu28":
                return "成都";
            case "guangzhou32":
                return "广州";
            case "beijing1":
                return "北京";
            default:
                return "未知城市";
        }
    }

    /** Builds the hotel records from the two marker lines by counting double quotes. */
    private static List<List<String>> parseHotels(String roomnum, String hotelinfo, String cityName) {
        List<List<String>> lists = new ArrayList<>();

        // Pass 1 (htllist line): quotes come in groups of 8 per hotel. The value opened by
        // quote 3 is the hotel id and starts a new record; the value opened by quote 7 is
        // appended to the records in order.
        int filled = 0;
        int quotes = 0;
        int pos = roomnum.indexOf('"');
        while (pos >= 0) {
            quotes++;
            int slot = quotes % 8;
            if (slot != 3 && slot != 7) {
                pos = roomnum.indexOf('"', pos + 1);
                continue;
            }
            int close = roomnum.indexOf('"', pos + 1);
            if (close < 0) {
                break; // unterminated value: stop instead of running past the end
            }
            String value = roomnum.substring(pos + 1, close);
            if (slot == 3) {
                List<String> list = new ArrayList<>();
                list.add(value);
                list.add(cityName);
                lists.add(list);
            } else if (filled < lists.size()) {
                lists.get(filled++).add(value);
            }
            // Continue from the closing quote so that it is counted as well.
            pos = close;
        }

        // Pass 2 (hotelPositionJSON line): quotes come in groups of 56 per hotel; the
        // selected slots are appended to the current record, and slot 43 completes it.
        int current = 0;
        quotes = 0;
        pos = hotelinfo.indexOf('"');
        while (pos >= 0 && current < lists.size()) {
            quotes++;
            int slot = quotes % 56;
            if (!isHotelInfoSlot(slot)) {
                pos = hotelinfo.indexOf('"', pos + 1);
                continue;
            }
            int close = hotelinfo.indexOf('"', pos + 1);
            if (close < 0) {
                break; // unterminated value
            }
            lists.get(current).add(hotelinfo.substring(pos + 1, close));
            if (slot == 43) {
                current++;
            }
            pos = close;
        }
        return lists;
    }

    /** Tells whether the value opened by this quote slot of hotelPositionJSON is extracted. */
    private static boolean isHotelInfoSlot(int slot) {
        switch (slot) {
            case 7:
            case 11:
            case 15:
            case 23:
            case 27:
            case 31:
            case 43:
                return true;
            default:
                return false;
        }
    }
}