Java爬取携程酒店数据并保存到本地数据库

话不多说,直接上代码

1:配置依赖

    </dependency>
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <!-- jsoup 1.8.3. NOTE(review): not referenced by the crawler code shown in this article; confirm it is needed. -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.8.3</version>
    </dependency>

    <!-- org.json 20160810. NOTE(review): not referenced by the crawler code shown in this article; confirm it is needed. -->
    <dependency>
      <groupId>org.json</groupId>
      <artifactId>json</artifactId>
      <version>20160810</version>
    </dependency>


    <!-- fastjson 1.2.47. NOTE(review): not referenced by the crawler code shown in this article; -->
    <!-- this is an old release, so check it against current security advisories before keeping it. -->
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.47</version>
    </dependency>

    <!-- MySQL JDBC driver; the database configuration names com.mysql.jdbc.Driver as the driver class. -->
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.47</version>
    </dependency>

    <!-- MyBatis; the crawler obtains its SqlSessionFactory, SqlSession and mapper through it. -->
    <dependency>
      <groupId>org.mybatis</groupId>
      <artifactId>mybatis</artifactId>
      <version>3.5.1</version>
    </dependency>

2:数据库表bean对象

/**
 * Bean holding the data of one hotel row.
 *
 * <p>All values are kept as plain strings. The fields are: hotel id, price, city,
 * hotel name, picture, map coordinates ({@code xpath}/{@code ypath}), address,
 * star rating ({@code start}) and review score.
 */
public class Hotel {

    /** Hotel id. */
    private String hid;
    /** Hotel price. */
    private String price;
    /** City the hotel is located in. */
    private String city;
    /** Hotel name. */
    private String name;
    /** Hotel picture. */
    private String pic;
    /** First map coordinate (presumably latitude; confirm against the crawler). */
    private String xpath;
    /** Second map coordinate (presumably longitude; confirm against the crawler). */
    private String ypath;
    /** Hotel address. */
    private String addr;
    /** Star rating. */
    private String start;
    /** Review score. */
    private String score;

    // ---- readers ----

    /** @return the hotel id */
    public String getHid() {
        return hid;
    }

    /** @return the hotel price */
    public String getPrice() {
        return price;
    }

    /** @return the city */
    public String getCity() {
        return city;
    }

    /** @return the hotel name */
    public String getName() {
        return name;
    }

    /** @return the hotel picture */
    public String getPic() {
        return pic;
    }

    /** @return the first map coordinate */
    public String getXpath() {
        return xpath;
    }

    /** @return the second map coordinate */
    public String getYpath() {
        return ypath;
    }

    /** @return the hotel address */
    public String getAddr() {
        return addr;
    }

    /** @return the star rating */
    public String getStart() {
        return start;
    }

    /** @return the review score */
    public String getScore() {
        return score;
    }

    // ---- writers ----

    /** @param hid the hotel id */
    public void setHid(String hid) {
        this.hid = hid;
    }

    /** @param price the hotel price */
    public void setPrice(String price) {
        this.price = price;
    }

    /** @param city the city */
    public void setCity(String city) {
        this.city = city;
    }

    /** @param name the hotel name */
    public void setName(String name) {
        this.name = name;
    }

    /** @param pic the hotel picture */
    public void setPic(String pic) {
        this.pic = pic;
    }

    /** @param xpath the first map coordinate */
    public void setXpath(String xpath) {
        this.xpath = xpath;
    }

    /** @param ypath the second map coordinate */
    public void setYpath(String ypath) {
        this.ypath = ypath;
    }

    /** @param addr the hotel address */
    public void setAddr(String addr) {
        this.addr = addr;
    }

    /** @param start the star rating */
    public void setStart(String start) {
        this.start = start;
    }

    /** @param score the review score */
    public void setScore(String score) {
        this.score = score;
    }

    /**
     * Renders every field as {@code name='value'} inside {@code Hotel{...}}.
     *
     * @return the textual form of this bean
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Hotel{");
        text.append("hid='").append(hid).append('\'');
        text.append(", price='").append(price).append('\'');
        text.append(", city='").append(city).append('\'');
        text.append(", name='").append(name).append('\'');
        text.append(", pic='").append(pic).append('\'');
        text.append(", xpath='").append(xpath).append('\'');
        text.append(", ypath='").append(ypath).append('\'');
        text.append(", addr='").append(addr).append('\'');
        text.append(", start='").append(start).append('\'');
        text.append(", score='").append(score).append('\'');
        text.append('}');
        return text.toString();
    }
}

3:数据库配置(jdbc.properties)

# JDBC connection settings; the MyBatis configuration reads these keys
# through the ${driver}, ${url}, ${username} and ${password} placeholders.
driver=com.mysql.jdbc.Driver
# The last path segment of the URL is the database (schema) name, not a table name.
url=jdbc:mysql://localhost:3306/数据库名
username=***
password=***

4:mybatis配置(mybatis-config.xml)

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
        PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-config.dtd">
<!-- MyBatis main configuration: one environment backed by a JDBC data source, plus the hotel mapper. -->
<configuration>
    <!-- Loads jdbc.properties so the ${...} placeholders below can be resolved. -->
    <properties resource="jdbc.properties"></properties>
    <!-- "development" is the only environment defined here and is selected as the default. -->
    <environments default="development">
        <environment id="development">
            <transactionManager type="JDBC"/>
            <!-- Connection settings come from the driver/url/username/password keys of jdbc.properties. -->
            <dataSource type="POOLED">
                <property name="driver" value="${driver}"/>
                <property name="url" value="${url}"/>
                <property name="username" value="${username}"/>
                <property name="password" value="${password}"/>
            </dataSource>
        </environment>
    </environments>
    <!-- SQL mapping files, looked up as classpath resources. -->
    <mappers>
        <mapper resource="HotelMapper.xml"/>
    </mappers>
</configuration>

5:dao接口,为插入数据做准备

/**
 * Data-access interface for hotel rows; obtained in the crawler through
 * {@code SqlSession.getMapper(HotelMapper.class)}.
 */
public interface HotelMapper {

    /**
     * Inserts one hotel.
     *
     * @param hotel the hotel whose field values are written
     * @return the {@code int} result of the insert statement
     *         (presumably the number of affected rows; confirm against the MyBatis documentation)
     */
    int insert(Hotel hotel);
}

6:编写dao接口的映射文件(HotelMapper.xml)

<!DOCTYPE mapper
        PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- SQL mapping for the HotelMapper interface; the namespace must equal the interface's fully qualified name. -->
<mapper namespace="com.yc.dao.HotelMapper">
    <!-- Bound to HotelMapper.insert(Hotel): each #{...} placeholder is filled from the matching Hotel property. -->
    <!-- id is passed as null; presumably the column is auto-generated by the database (confirm against the table definition). -->
    <insert id="insert" parameterType="com.yc.model.Hotel">
        insert into hotel(id,hid,name,price,city,xpath,ypath,pic,addr,start,score)
        values (null,#{hid},#{name},#{price},#{city},#{xpath},#{ypath},#{pic},#{addr},#{start},#{score})
    </insert>
</mapper>

7:终极奥义,爬虫代码

/**
 * Crawls Ctrip hotel list pages and stores every hotel found in the local
 * database through the MyBatis {@code HotelMapper}.
 */
public class Test02 {

    /** URL prefix of the hotel list pages; the city path (for example "guangzhou32/p1") is appended. */
    private static final String BASE_URL = "https://hotels.ctrip.com/hotel/";

    /** Timeouts so that a stalled server cannot hang the crawler forever. */
    private static final int CONNECT_TIMEOUT_MS = 15000;
    private static final int READ_TIMEOUT_MS = 30000;

    /** Number of values a complete row returned by {@link #getHotelInfoByCity(String)} contains. */
    private static final int FIELDS_PER_HOTEL = 10;

    /**
     * Entry point: builds the MyBatis session from {@code mybatis-config.xml}, then requests
     * the Guangzhou list pages one by one and inserts every complete hotel row.
     *
     * @param args unused
     * @throws IOException          if the configuration cannot be read or a page download fails
     * @throws InterruptedException if the pause between two page requests is interrupted
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        String resource = "mybatis-config.xml";
        SqlSessionFactory sqlSessionFactory;
        // Let a configuration failure propagate instead of continuing with a null stream.
        try (InputStream inputStream = Resources.getResourceAsStream(resource)) {
            sqlSessionFactory = new SqlSessionFactoryBuilder().build(inputStream);
        }

        // openSession(true) requests an auto-commit session; it is closed when the crawl ends or fails.
        try (SqlSession sqlSession = sqlSessionFactory.openSession(true)) {
            HotelMapper mapper = sqlSession.getMapper(HotelMapper.class);
            System.out.println(getHotelInfoByCity("guangzhou32"));

            String pagePrefix = "guangzhou32/p";
            int stored = 0;
            // NOTE(review): only odd page numbers are requested (1, 3, 5, ...); confirm that
            // skipping the even pages is intended.
            for (int page = 1; page < 100; page += 2) {
                String pagePath = pagePrefix + page;
                // Pause between requests to keep the request rate low.
                Thread.sleep(5000);
                List<List<String>> rows = getHotelInfoByCity(pagePath);
                if (rows == null) {
                    // The page did not contain the two data lines; skip it instead of failing.
                    System.out.println("未找到酒店数据,跳过:" + pagePath);
                    continue;
                }
                for (List<String> row : rows) {
                    if (row.size() < FIELDS_PER_HOTEL) {
                        // An incomplete row cannot be mapped onto the bean reliably.
                        System.out.println("数据不完整,跳过:" + row);
                        continue;
                    }
                    System.out.println(mapper.insert(toHotel(row)));
                    sqlSession.commit();
                    stored++;
                }
                System.out.println("已爬取" + stored + "条数据");
            }
        }
    }

    /**
     * Downloads one hotel list page and extracts the hotel data embedded in it.
     *
     * <p>Two lines of the page are used: the line whose characters 12..18 read
     * {@code htllist} and the line whose characters 8..24 read {@code hotelPositionJSON}.
     * Values are picked by counting double quotes on those lines.
     *
     * <p>Each returned row holds, in order: [0] the hotel id, [1] the Chinese city name,
     * [2] a second value from the {@code htllist} line (the original notes call it the number
     * of remaining rooms, while {@code main} stores it as the price; confirm against the page
     * format), followed by up to seven values from the {@code hotelPositionJSON} line
     * (according to the original notes: name, latitude, longitude, picture, address, score
     * and star rating).
     *
     * @param city the path appended to the list URL, for example {@code "guangzhou32"} or
     *             {@code "guangzhou32/p3"}
     * @return the extracted rows, or {@code null} when either data line is missing
     * @throws IOException if the page cannot be downloaded
     */
    public static List<List<String>> getHotelInfoByCity(String city) throws IOException {
        URL url = new URL(BASE_URL + city);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        connection.setReadTimeout(READ_TIMEOUT_MS);

        String roomnum = null;    // text of the htllist line, starting at the key
        String hotelinfo = null;  // text of the hotelPositionJSON line, starting at the key
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "utf-8"))) {
            String current;
            while ((current = in.readLine()) != null) {
                // regionMatches is false for lines that are too short, so no length check is needed,
                // and substring has no fixed capacity that a long line could overflow.
                if (roomnum == null && current.regionMatches(12, "htllist", 0, 7)) {
                    roomnum = current.substring(12);
                }
                if (hotelinfo == null && current.regionMatches(8, "hotelPositionJSON", 0, 17)) {
                    hotelinfo = current.substring(8);
                }
                if (roomnum != null && hotelinfo != null) {
                    break; // both lines found, the rest of the page is not needed
                }
            }
        } finally {
            connection.disconnect();
        }

        if (roomnum == null || hotelinfo == null) {
            return null;
        }

        String cityName = cityNameOf(city);
        List<List<String>> lists = new ArrayList<List<String>>();

        // htllist line: within every group of 8 quotes, the 3rd quote opens the hotel id
        // and the 7th quote opens the second value of that hotel.
        int quotes = 0;
        int secondValueRow = 0;
        for (int i = 0; i < roomnum.length(); i++) {
            if (roomnum.charAt(i) != '"') {
                continue;
            }
            quotes++;
            int slot = quotes % 8;
            if (slot != 3 && slot != 7) {
                continue;
            }
            int close = roomnum.indexOf('"', i + 1);
            if (close < 0) {
                break; // unterminated value: stop instead of running past the end
            }
            String value = roomnum.substring(i + 1, close);
            // Step back so the loop increment lands on the closing quote and counts it.
            i = close - 1;
            if (slot == 3) {
                List<String> row = new ArrayList<String>();
                row.add(value);
                row.add(cityName);
                lists.add(row);
            } else if (secondValueRow < lists.size()) {
                lists.get(secondValueRow++).add(value);
            }
        }

        // hotelPositionJSON line: within every group of 56 quotes, seven quote positions open
        // the wanted values; the one at position 43 is the last value of a hotel.
        quotes = 0;
        int infoRow = 0;
        for (int i = 0; i < hotelinfo.length(); i++) {
            if (hotelinfo.charAt(i) != '"') {
                continue;
            }
            quotes++;
            int slot = quotes % 56;
            if (!isWantedHotelField(slot)) {
                continue;
            }
            int close = hotelinfo.indexOf('"', i + 1);
            if (close < 0 || infoRow >= lists.size()) {
                break; // malformed text, or more position entries than hotel ids
            }
            lists.get(infoRow).add(hotelinfo.substring(i + 1, close));
            i = close - 1;
            if (slot == 43) {
                infoRow++;
            }
        }
        return lists;
    }

    /**
     * Compares the first {@code length} characters of a char array with a string.
     *
     * @param left   the characters to check
     * @param right  the expected text
     * @param length how many leading characters to compare
     * @return {@code true} when the first {@code length} characters are equal (always
     *         {@code true} for a non-positive length); {@code false} when they differ or
     *         when either argument is missing or shorter than {@code length}
     */
    public static boolean charsequals(char[] left, String right, int length) {
        if (length <= 0) {
            return true;
        }
        if (left == null || right == null || length > left.length || length > right.length()) {
            return false;
        }
        for (int i = 0; i < length; i++) {
            if (left[i] != right.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    /** Copies one complete row (at least {@link #FIELDS_PER_HOTEL} values) into a new bean. */
    private static Hotel toHotel(List<String> row) {
        Hotel hotel = new Hotel();
        hotel.setHid(row.get(0));
        hotel.setCity(row.get(1));
        hotel.setPrice(row.get(2));
        hotel.setName(row.get(3));
        hotel.setXpath(row.get(4));
        hotel.setYpath(row.get(5));
        hotel.setPic(row.get(6));
        hotel.setAddr(row.get(7));
        hotel.setScore(row.get(8));
        // Only the last character of the star text is stored; an empty text counts as "0".
        String star = row.get(9);
        hotel.setStart(star.isEmpty() ? "0" : String.valueOf(star.charAt(star.length() - 1)));
        return hotel;
    }

    /** Maps a city path to its Chinese name, ignoring a page suffix such as "/p3". */
    private static String cityNameOf(String cityPath) {
        int slash = cityPath.indexOf('/');
        String cityKey = slash >= 0 ? cityPath.substring(0, slash) : cityPath;
        switch (cityKey) {
            case "chengdu28":
                return "成都";
            case "guangzhou32":
                return "广州";
            case "beijing1":
                return "北京";
            default:
                return "未知城市";
        }
    }

    /** Tells whether the quote at this position (modulo 56) opens one of the seven wanted values. */
    private static boolean isWantedHotelField(int slot) {
        return slot == 7 || slot == 11 || slot == 15 || slot == 23
                || slot == 27 || slot == 31 || slot == 43;
    }
}

 

posted @ 2023-01-31 11:55  昨夜风雨声  阅读(140)  评论(0)  编辑  收藏  举报  来源