仿京东搜索
项目介绍:基于springboot的前后端分离项目,利用爬虫将京东首页的数据爬取下来,然后将数据放到ElasticSearch中,通过后端配置查询规则实现仿京东搜索。
功能:实现分页高亮查询
主要负责:
1、Jsoup 爬取数据;2、实现搜索数据(条件、精确、分页、高亮搜索)
项目地址:https://gitee.com/jamer/jingdong-search
依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.2.5.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.renzhe</groupId>
    <artifactId>jd-search</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>jd-search</name>
    <description>Demo project for Spring Boot</description>

    <properties>
        <java.version>1.8</java.version>
        <!-- Must match the client version in spring-boot-starter-data-elasticsearch. -->
        <elasticsearch.version>7.6.1</elasticsearch.version>
    </properties>

    <dependencies>
        <!-- HTML parsing for the JD crawler.
             Bumped from 1.11.3: that release is affected by CVE-2021-37714
             (parser infinite-loop DoS), fixed in 1.14.2+. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.14.3</version>
        </dependency>
        <!-- JSON serialization for indexing into Elasticsearch.
             Bumped from 1.2.50: fastjson releases before 1.2.83 carry known
             autotype deserialization RCE vulnerabilities. -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.83</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-thymeleaf</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <scope>runtime</scope>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <excludes>
                        <!-- Lombok is compile-time only; keep it out of the fat jar. -->
                        <exclude>
                            <groupId>org.projectlombok</groupId>
                            <artifactId>lombok</artifactId>
                        </exclude>
                    </excludes>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
工具类utils 用来解析网页
package com.renzhe.utils;

import com.renzhe.pojo.Content;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Crawls the JD search-result page for a keyword and converts each
 * product entry into a {@link Content} (title, image URL, price text).
 */
@Component
public class HtmlParseUtil {

    /** Connect/read timeout for the JD page, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 30_000;

    /**
     * Fetches {@code https://search.jd.com/Search?keyword=...} and parses
     * the product list ({@code #J_goodsList}) out of the returned HTML.
     * Note: products rendered later via ajax are not captured.
     *
     * @param keyword search term (URL-encoded here, so non-ASCII keywords work)
     * @return one {@link Content} per product; empty list if the goods
     *         list element is absent (e.g. the request was blocked)
     * @throws IOException if the page cannot be fetched
     */
    public List<Content> parseJD(String keyword) throws IOException {
        // FIX: the original interpolated the raw keyword into the URL,
        // which produces a malformed request for Chinese/space-containing terms.
        String url = "https://search.jd.com/Search?keyword=" + URLEncoder.encode(keyword, "UTF-8");
        // Jsoup returns a browser-style Document; DOM-style traversal works below.
        Document document = Jsoup.parse(new URL(url), TIMEOUT_MILLIS);
        Element goodsList = document.getElementById("J_goodsList");
        // FIX: guard against a missing goods list instead of NPE-ing.
        if (goodsList == null) {
            return Collections.emptyList();
        }
        // Each <li> under the goods list is one product card.
        Elements items = goodsList.getElementsByTag("li");
        List<Content> goods = new ArrayList<>(items.size());
        for (Element el : items) {
            // Images are lazy-loaded on JD: the real URL lives in the
            // data-lazy-img attribute, not in src.
            String img = el.getElementsByTag("img").eq(0).attr("data-lazy-img");
            String price = el.getElementsByClass("p-price").eq(0).text();
            String title = el.getElementsByClass("p-name").eq(0).text();
            Content content = new Content();
            content.setImg(img);
            content.setPrice(price);
            content.setTitle(title);
            goods.add(content);
        }
        return goods;
    }
}
pojo类
package com.renzhe.pojo;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * One crawled JD product; also the JSON document shape indexed into the
 * {@code jd_goods} Elasticsearch index by ContentService.
 */
@Data               // generates getters/setters, equals/hashCode AND toString —
                    // the original's extra @ToString was redundant and is removed
@AllArgsConstructor
@NoArgsConstructor
public class Content {
    private String title; // product name text (from the p-name element)
    private String img;   // lazy-loaded image URL (from data-lazy-img)
    private String price; // display price text (from the p-price element)
}
service
package com.renzhe.service;

import com.alibaba.fastjson.JSON;
import com.renzhe.pojo.Content;
import com.renzhe.utils.HtmlParseUtil;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

/**
 * Business layer: crawls JD products into the {@code jd_goods} index and
 * serves paged (optionally highlighted) term searches over it.
 */
@Service
public class ContentService {

    private static final String INDEX_NAME = "jd_goods";

    @Autowired
    private RestHighLevelClient restHighLevelClient;

    // FIX: injected instead of "new HtmlParseUtil()" — the util is a Spring
    // @Component, so instantiating it by hand bypassed the container.
    @Autowired
    private HtmlParseUtil htmlParseUtil;

    /**
     * Crawls JD for {@code keywords} and bulk-indexes the results.
     *
     * @return true if every document was indexed without failure
     * @throws IOException on crawl or Elasticsearch I/O errors
     */
    public Boolean parseContent(String keywords) throws IOException {
        List<Content> contents = htmlParseUtil.parseJD(keywords);
        BulkRequest bulkRequest = new BulkRequest();
        bulkRequest.timeout("2m"); // generous timeout for a large crawl batch
        for (Content content : contents) {
            bulkRequest.add(new IndexRequest(INDEX_NAME)
                    .source(JSON.toJSONString(content), XContentType.JSON));
        }
        BulkResponse bulk = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
        return !bulk.hasFailures();
    }

    /**
     * Paged term search on {@code title}, without highlighting.
     *
     * @param pageNo   1-based page number (values below 1 are clamped to 1)
     * @param pageSize hits per page
     */
    public List<Map<String, Object>> searchPage(String keyword, int pageNo, int pageSize)
            throws IOException {
        SearchResponse response = doSearch(keyword, pageNo, pageSize, false);
        List<Map<String, Object>> results = new ArrayList<>();
        for (SearchHit hit : response.getHits().getHits()) {
            results.add(hit.getSourceAsMap());
        }
        return results;
    }

    /**
     * Paged term search on {@code title} with the matched term wrapped in a
     * red {@code <span>} inside each hit's {@code title} field.
     */
    public List<Map<String, Object>> searchPageHighlightBuilder(String keyword, int pageNo, int pageSize)
            throws IOException {
        SearchResponse response = doSearch(keyword, pageNo, pageSize, true);
        List<Map<String, Object>> results = new ArrayList<>();
        for (SearchHit hit : response.getHits().getHits()) {
            Map<String, Object> sourceAsMap = hit.getSourceAsMap();
            HighlightField title = hit.getHighlightFields().get("title");
            if (title != null) {
                // FIX: the original seeded the accumulator with " ", leaving a
                // stray leading space in every highlighted title.
                StringBuilder highlighted = new StringBuilder();
                for (Text fragment : title.fragments()) {
                    highlighted.append(fragment);
                }
                // Replace the plain title with the highlighted fragments.
                sourceAsMap.put("title", highlighted.toString());
            }
            results.add(sourceAsMap);
        }
        return results;
    }

    /** Builds and executes the shared paged term query (dedupes the two search paths). */
    private SearchResponse doSearch(String keyword, int pageNo, int pageSize, boolean highlight)
            throws IOException {
        if (pageNo < 1) {
            pageNo = 1;
        }
        SearchRequest searchRequest = new SearchRequest(INDEX_NAME);
        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
        // BUG FIX: from() takes a document OFFSET, not a page number. The
        // original passed pageNo directly, so page 1 silently skipped the
        // first hit and every page was off by one page-number's worth.
        sourceBuilder.from((pageNo - 1) * pageSize);
        sourceBuilder.size(pageSize);
        sourceBuilder.query(QueryBuilders.termQuery("title", keyword)); // exact term match
        sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));
        if (highlight) {
            HighlightBuilder highlightBuilder = new HighlightBuilder();
            highlightBuilder.field("title");
            highlightBuilder.requireFieldMatch(false);
            highlightBuilder.preTags("<span style='color:red'>");
            highlightBuilder.postTags("</span>");
            sourceBuilder.highlighter(highlightBuilder);
        }
        searchRequest.source(sourceBuilder);
        return restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
    }
}
config
package com.renzhe.config;

import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

/**
 * Exposes the Elasticsearch high-level REST client as a Spring bean,
 * pointed at the local node on 127.0.0.1:9200 over plain HTTP.
 */
@Configuration
public class ElasticSearchClientConfig {

    @Bean
    public RestHighLevelClient restHighLevelClient() {
        HttpHost localNode = new HttpHost("127.0.0.1", 9200, "http");
        return new RestHighLevelClient(RestClient.builder(localNode));
    }
}
controller
package com.renzhe.controller;

import com.renzhe.service.ContentService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RestController;

import java.io.IOException;
import java.util.List;
import java.util.Map;

/**
 * REST endpoints: trigger the JD crawl/index, and run highlighted
 * paged searches against the indexed products.
 */
@RestController
public class ContentController {

    @Autowired
    private ContentService contentService;

    /**
     * Crawls JD for {@code keywords} and indexes the results.
     *
     * @return true if indexing succeeded without failures
     */
    @GetMapping("/parse/{keywords}")
    public Boolean parse(@PathVariable("keywords") String keywords) throws IOException {
        // FIX: dropped the leftover debug System.out.println(flag); the flag
        // is already returned to the caller in the response body.
        return contentService.parseContent(keywords);
    }

    /**
     * Highlighted, paged search on the product title.
     *
     * @param pageNo   1-based page number
     * @param pageSize hits per page
     */
    @GetMapping("/search/{keyword}/{pageNo}/{pageSize}")
    public List<Map<String, Object>> search(@PathVariable("keyword") String keyword,
                                            @PathVariable("pageNo") int pageNo,
                                            @PathVariable("pageSize") int pageSize) throws Exception {
        return contentService.searchPageHighlightBuilder(keyword, pageNo, pageSize);
    }
}
application.properties
server.port=9090
# 关闭 Thymeleaf 的缓存(开发时模板修改即时生效)
spring.thymeleaf.cache=false