Flink之热门页面统计

1、数据格式

83.149.9.123 - - 17/05/2020:10:05:03 +0000 GET /presentations/logstash-kafkamonitor-2020/images/kibana-search.png
83.149.9.123 - - 17/05/2020:10:05:43 +0000 GET /presentations/logstash-kafkamonitor-2020/images/kibana-dashboard3.png
83.149.9.123 - - 17/05/2020:10:05:47 +0000 GET /presentations/logstash-kafkamonitor-2020/plugin/highlight/highlight.js
83.149.9.123 - - 17/05/2020:10:05:12 +0000 GET /presentations/logstash-kafkamonitor-2020/plugin/zoom-js/zoom.js
83.149.9.123 - - 17/05/2020:10:05:07 +0000 GET /presentations/logstash-kafkamonitor-2020/plugin/notes/notes.js
83.149.9.123 - - 17/05/2020:10:05:34 +0000 GET /presentations/logstash-kafkamonitor-2020/images/sad-medic.png
83.149.9.123 - - 17/05/2020:10:05:57 +0000 GET /presentations/logstash-kafkamonitor-2020/css/fonts/Roboto-Bold.ttf
83.149.9.123 - - 17/05/2020:10:05:50 +0000 GET /presentations/logstash-kafkamonitor-2020/css/fonts/Roboto-Regular.ttf
83.149.9.123 - - 17/05/2020:10:05:24 +0000 GET /presentations/logstash-kafkamonitor-2020/images/frontend-response-codes.png
83.149.9.123 - - 17/05/2020:10:05:50 +0000 GET /presentations/logstash-kafkamonitor-2020/images/kibana-dashboard.png
83.149.9.123 - - 17/05/2020:10:05:46 +0000 GET /presentations/logstash-kafkamonitor-2020/images/Dreamhost_logo.svg
83.149.9.123 - - 17/05/2020:10:05:11 +0000 GET /presentations/logstash-kafkamonitor-2020/images/kibana-dashboard2.png
83.149.9.123 - - 17/05/2020:10:05:19 +0000 GET /presentations/logstash-kafkamonitor-2020/images/apache-icon.gif
83.149.9.123 - - 17/05/2020:10:05:33 +0000 GET /presentations/logstash-kafkamonitor-2020/images/nagios-sms5.png
83.149.9.123 - - 17/05/2020:10:05:00 +0000 GET /presentations/logstash-kafkamonitor-2020/images/redis.png
83.149.9.123 - - 17/05/2020:10:05:25 +0000 GET /presentations/logstash-kafkamonitor-2020/images/elasticsearch.png
83.149.9.123 - - 17/05/2020:10:05:59 +0000 GET /presentations/logstash-kafkamonitor-2020/images/logstashbook.png
83.149.9.123 - - 17/05/2020:10:05:30 +0000 GET /presentations/logstash-kafkamonitor-2020/images/github-contributions.png
83.149.9.123 - - 17/05/2020:10:05:53 +0000 GET /presentations/logstash-kafkamonitor-2020/css/print/paper.css
83.149.9.123 - - 17/05/2020:10:05:24 +0000 GET /presentations/logstash-kafkamonitor-2020/images/1983_delorean_dmc-12-pic-38289.jpeg
83.149.9.123 - - 17/05/2020:10:05:54 +0000 GET /presentations/logstash-kafkamonitor-2020/images/simple-inputs-filters-outputs.jpg
83.149.9.123 - - 17/05/2020:10:05:33 +0000 GET /presentations/logstash-kafkamonitor-2020/images/tiered-outputs-to-inputs.jpg
83.149.9.123 - - 17/05/2020:10:05:56 +0000 GET /favicon.ico
24.236.252.67 - - 17/05/2020:10:05:40 +0000 GET /favicon.ico
93.114.45.13 - - 17/05/2020:10:05:14 +0000 GET /articles/dynamic-dns-with-dhcp/
93.114.45.13 - - 17/05/2020:10:05:04 +0000 GET /reset.css
93.114.45.13 - - 17/05/2020:10:05:45 +0000 GET /style2.css
93.114.45.13 - - 17/05/2020:10:05:14 +0000 GET /favicon.ico
93.114.45.13 - - 17/05/2020:10:05:17 +0000 GET /images/jordan-80.png
93.114.45.13 - - 17/05/2020:10:05:21 +0000 GET /images/web/2009/banner.png
66.249.73.135 - - 17/05/2020:10:05:40 +0000 GET /blog/tags/ipv6
50.16.19.13 - - 17/05/2020:10:05:10 +0000 GET /blog/tags/puppet?flav=rss20
66.249.73.185 - - 17/05/2020:10:05:37 +0000 GET /
110.136.166.128 - - 17/05/2020:10:05:35 +0000 GET /projects/xdotool/
46.105.14.53 - - 17/05/2020:10:05:03 +0000 GET /blog/tags/puppet?flav=rss20
110.136.166.128 - - 17/05/2020:10:05:06 +0000 GET /reset.css
110.136.166.128 - - 17/05/2020:10:05:03 +0000 GET /style2.css
110.136.166.128 - - 17/05/2020:10:05:41 +0000 GET /favicon.ico
110.136.166.128 - - 17/05/2020:10:05:32 +0000 GET /images/jordan-80.png
123.125.71.35 - - 17/05/2020:10:05:46 +0000 GET /blog/tags/release
110.136.166.128 - - 17/05/2020:10:05:08 +0000 GET /images/web/2009/banner.png
50.150.204.184 - - 17/05/2020:10:05:46 +0000 GET /images/googledotcom.png
207.241.237.225 - - 17/05/2020:10:05:58 +0000 GET /blog/tags/examples
200.49.190.101 - - 17/05/2020:10:05:36 +0000 GET /reset.css
200.49.190.100 - - 17/05/2020:10:05:38 +0000 GET /blog/tags/web
200.49.190.101 - - 17/05/2020:10:05:11 +0000 GET /style2.css
200.49.190.101 - - 17/05/2020:10:05:37 +0000 GET /images/jordan-80.png
66.249.73.185 - - 17/05/2020:10:05:00 +0000 GET /reset.css
66.249.73.135 - - 17/05/2020:10:05:16 +0000 GET /blog/tags/munin
66.249.73.135 - - 17/05/2020:10:05:33 +0000 GET /blog/tags/firefox?flav=rss20
66.249.73.135 - - 17/05/2020:10:05:17 +0000 GET /blog/geekery/eventdb-ideas.html
67.214.178.190 - - 17/05/2020:10:05:48 +0000 GET /
67.214.178.190 - - 17/05/2020:10:05:18 +0000 GET /blog/geekery/installing-windows-8-consumer-preview.html
207.241.237.220 - - 17/05/2020:10:05:28 +0000 GET /blog/tags/projects
46.105.14.53 - - 17/05/2020:10:05:44 +0000 GET /blog/tags/puppet?flav=rss20
207.241.237.227 - - 17/05/2020:10:05:47 +0000 GET /blog/geekery/soekris-gpio.html
91.177.205.119 - - 17/05/2020:10:05:22 +0000 GET /blog/geekery/xvfb-firefox.html
91.177.205.119 - - 17/05/2020:10:05:34 +0000 GET /reset.css
91.177.205.119 - - 17/05/2020:10:05:37 +0000 GET /style2.css
91.177.205.119 - - 17/05/2020:10:05:54 +0000 GET /images/jordan-80.png
91.177.205.119 - - 17/05/2020:10:05:31 +0000 GET /images/web/2009/banner.png
91.177.205.119 - - 17/05/2020:10:05:32 +0000 GET /favicon.ico
66.249.73.185 - - 17/05/2020:10:05:22 +0000 GET /doc/index.html?org/elasticsearch/action/search/SearchResponse.html
207.241.237.228 - - 17/05/2020:10:05:40 +0000 GET /blog/tags/defcon
207.241.237.101 - - 17/05/2020:10:05:51 +0000 GET /blog/tags/regex
87.169.99.232 - - 17/05/2020:10:05:59 +0000 GET /presentations/puppet-at-loggly/puppet-at-loggly.pdf.html
209.85.238.199 - - 17/05/2020:10:05:30 +0000 GET /blog/tags/firefox?flav=rss20
209.85.238.199 - - 17/05/2020:10:05:15 +0000 GET /test.xml
81.220.24.207 - - 17/05/2020:10:05:13 +0000 GET /blog/geekery/ssl-latency.html
81.220.24.207 - - 17/05/2020:10:05:44 +0000 GET /reset.css
81.220.24.207 - - 17/05/2020:10:05:26 +0000 GET /images/jordan-80.png
81.220.24.207 - - 17/05/2020:10:05:39 +0000 GET /style2.css
81.220.24.207 - - 17/05/2020:10:05:52 +0000 GET /images/web/2009/banner.png
81.220.24.207 - - 17/05/2020:10:05:21 +0000 GET /favicon.ico
66.249.73.135 - - 17/05/2020:11:05:17 +0000 GET /blog/geekery/vmware-cpu-performance.html
46.105.14.53 - - 17/05/2020:11:05:42 +0000 GET /blog/tags/puppet?flav=rss20
218.30.103.62 - - 17/05/2020:11:05:11 +0000 GET /robots.txt
218.30.103.62 - - 17/05/2020:11:05:46 +0000 GET /robots.txt
218.30.103.62 - - 17/05/2020:11:05:45 +0000 GET /projects/fex/
74.125.40.20 - - 17/05/2020:11:05:59 +0000 GET /?flav=rss20
71.212.224.97 - - 17/05/2020:11:05:05 +0000 GET /projects/xdotool/
71.212.224.97 - - 17/05/2020:11:05:15 +0000 GET /reset.css
71.212.224.97 - - 17/05/2020:11:05:22 +0000 GET /style2.css
71.212.224.97 - - 17/05/2020:11:05:11 +0000 GET /images/jordan-80.png
71.212.224.97 - - 17/05/2020:11:05:28 +0000 GET /images/web/2009/banner.png
218.30.103.62 - - 17/05/2020:11:05:17 +0000 GET /projects/xdotool/xdotool.xhtml
108.174.55.234 - - 17/05/2020:11:05:26 +0000 GET /?flav=rss20
218.30.103.62 - - 17/05/2020:11:05:37 +0000 GET /blog/geekery/c-vs-python-bdb.html
121.107.188.202 - - 17/05/2020:11:05:09 +0000 GET /presentations/logstash-kafkamonitor-2020/images/kibana-dashboard3.png
218.30.103.62 - - 17/05/2020:11:05:39 +0000 GET /blog/productivity/better-zsh-xterm-title-fix.html
218.30.103.62 - - 17/05/2020:11:05:11 +0000 GET /blog/geekery/xvfb-firefox.html
218.30.103.62 - - 17/05/2020:11:05:00 +0000 GET /blog/geekery/puppet-facts-into-mcollective.html
198.46.149.143 - - 17/05/2020:11:05:10 +0000 GET /blog/geekery/disabling-battery-in-ubuntu-vms.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+semicomplete%2Fmain+%28semicomplete.com+-+Jordan+Sissel%29
198.46.149.143 - - 17/05/2020:11:05:48 +0000 GET /blog/geekery/solving-good-or-bad-problems.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+semicomplete%2Fmain+%28semicomplete.com+-+Jordan+Sissel%29
218.30.103.62 - - 17/05/2020:11:05:28 +0000 GET /blog/geekery/jquery-interface-puffer.html%20target=
218.30.103.62 - - 17/05/2020:11:05:05 +0000 GET /blog/geekery/ec2-reserved-vs-ondemand.html
66.249.73.135 - - 17/05/2020:11:05:31 +0000 GET /blog/web/firefox-scrolling-fix.html
86.1.76.62 - - 17/05/2020:11:05:36 +0000 GET /projects/xdotool/
86.1.76.62 - - 17/05/2020:11:05:25 +0000 GET /reset.css
86.1.76.62 - - 17/05/2020:11:05:19 +0000 GET /style2.css
86.1.76.62 - - 17/05/2020:11:05:03 +0000 GET /favicon.ico
86.1.76.62 - - 17/05/2020:11:05:28 +0000 GET /images/jordan-80.png
86.1.76.62 - - 17/05/2020:11:05:07 +0000 GET /images/web/2009/banner.png
66.249.73.135 - - 17/05/2020:11:05:58 +0000 GET /blog/tags/bdb
107.170.41.69 - - 17/05/2020:11:05:31 +0000 GET /?flav=atom
50.16.19.13 - - 17/05/2020:11:05:14 +0000 GET /blog/tags/puppet?flav=rss20
46.105.14.53 - - 17/05/2020:11:05:02 +0000 GET /blog/tags/puppet?flav=rss20
208.115.111.72 - - 17/05/2020:11:05:26 +0000 GET /blog/rants/fedora-yum.html
208.115.111.72 - - 17/05/2020:11:05:32 +0000 GET /blog/tags/grok
208.115.111.72 - - 17/05/2020:11:05:12 +0000 GET /blog/tags/is%20it%20done%20yet
208.115.111.72 - - 17/05/2020:11:05:07 +0000 GET /blog/tags/statistics
50.180.79.170 - - 17/05/2020:11:05:50 +0000 GET /favicon.ico
208.115.111.72 - - 17/05/2020:11:05:05 +0000 GET /blog/tags/subversion
208.115.111.72 - - 17/05/2020:11:05:52 +0000 GET /blog/web/194.html
208.115.111.72 - - 17/05/2020:11:05:23 +0000 GET /files/blogposts/20070901/?C=D;O=A
208.115.111.72 - - 17/05/2020:11:05:15 +0000 GET /files/blogposts/20080109/boost_xpressive_test.cpp
208.115.111.72 - - 17/05/2020:11:05:38 +0000 GET /files/blogposts/20090520/
208.115.111.72 - - 17/05/2020:11:05:41 +0000 GET /files/fastsplit/?C=M;O=D
208.115.111.72 - - 17/05/2020:11:05:19 +0000 GET /files/xdotool/docs/man/?C=M;O=D
208.115.111.72 - - 17/05/2020:11:05:16 +0000 GET /scripts/python/wrap/?C=N;O=D
208.115.111.72 - - 17/05/2020:11:05:32 +0000 GET /files/images/?C=S;O=D
208.115.111.72 - - 17/05/2020:11:05:00 +0000 GET /files/blogposts/20080611/
208.115.111.72 - - 17/05/2020:11:05:16 +0000 GET /files/logstash/?C=D;O=D
208.115.111.72 - - 17/05/2020:11:05:53 +0000 GET /presentations/hackday06/
208.115.111.72 - - 17/05/2020:11:05:29 +0000 GET /scripts/grok-py-test/
208.115.111.72 - - 17/05/2020:11:05:08 +0000 GET /?N=A&page=21
208.115.111.72 - - 17/05/2020:11:05:49 +0000 GET /blog/geekery/oniguruma-named-capture-example.html?commentlimit=0
208.115.111.72 - - 17/05/2020:11:05:01 +0000 GET /blog/geekery/ssh-key-invalid-hack.html?commentlimit=0
208.115.111.72 - - 17/05/2020:11:05:31 +0000 GET /blog/geekery/server-side-javascript.html
208.115.111.72 - - 17/05/2020:11:05:15 +0000 GET /blog/geekery/yahoo-hackday-08.html
105.235.130.196 - - 17/05/2020:11:05:01 +0000 GET /images/googledotcom.png
174.37.205.76 - - 17/05/2020:11:05:19 +0000 GET /blog
54.255.13.204 - - 17/05/2020:11:05:03 +0000 GET /articles/ssh-security/
105.235.130.196 - - 17/05/2020:11:05:45 +0000 GET /blog/tags/X11
54.255.13.204 - - 17/05/2020:11:05:55 +0000 GET /reset.css
54.255.13.204 - - 17/05/2020:11:05:32 +0000 GET /style2.css
54.255.13.204 - - 17/05/2020:11:05:10 +0000 GET /favicon.ico
105.235.130.196 - - 17/05/2020:11:05:20 +0000 GET /reset.css
54.255.13.204 - - 17/05/2020:11:05:46 +0000 GET /images/jordan-80.png
54.255.13.204 - - 17/05/2020:11:05:17 +0000 GET /images/web/2009/banner.png
105.235.130.196 - - 17/05/2020:11:05:47 +0000 GET /style2.css
105.235.130.196 - - 17/05/2020:11:05:37 +0000 GET /images/jordan-80.png
105.235.130.196 - - 17/05/2020:11:05:22 +0000 GET /images/web/2009/banner.png
134.76.249.10 - - 17/05/2020:11:05:01 +0000 GET /projects/xdotool/
134.76.249.10 - - 17/05/2020:11:05:09 +0000 GET /reset.css
134.76.249.10 - - 17/05/2020:11:05:57 +0000 GET /style2.css
134.76.249.10 - - 17/05/2020:11:05:23 +0000 GET /favicon.ico
134.76.249.10 - - 17/05/2020:11:05:40 +0000 GET /images/jordan-80.png
134.76.249.10 - - 17/05/2020:11:05:50 +0000 GET /images/web/2009/banner.png
134.76.249.10 - - 17/05/2020:11:05:47 +0000 GET /projects/xdotool
134.76.249.10 - - 17/05/2020:11:05:13 +0000 GET /projects/xdotool/
66.249.73.135 - - 17/05/2020:11:05:26 +0000 GET /?flav=atom
207.241.237.220 - - 17/05/2020:11:05:24 +0000 GET /blog/tags/C?page=2
68.184.202.186 - - 17/05/2020:11:05:28 +0000 GET /projects/xpathtool/
68.184.202.186 - - 17/05/2020:11:05:02 +0000 GET /reset.css
68.184.202.186 - - 17/05/2020:11:05:05 +0000 GET /images/jordan-80.png
68.184.202.186 - - 17/05/2020:11:05:02 +0000 GET /style2.css
68.184.202.186 - - 17/05/2020:11:05:37 +0000 GET /images/web/2009/banner.png
68.184.202.186 - - 17/05/2020:11:05:58 +0000 GET /favicon.ico
46.105.14.53 - - 17/05/2020:11:05:29 +0000 GET /blog/tags/puppet?flav=rss20
66.249.73.135 - - 17/05/2020:11:05:00 +0000 GET /?flav=rss20
24.233.162.179 - - 17/05/2020:11:05:31 +0000 GET /favicon.ico
123.125.71.117 - - 17/05/2020:11:05:16 +0000 GET /
220.181.108.153 - - 17/05/2020:11:05:09 +0000 GET /
65.19.138.34 - - 17/05/2020:11:05:40 +0000 GET /
66.249.73.135 - - 17/05/2020:11:05:32 +0000 GET /blog/geekery/rhapsody-on-linux.html
97.116.185.190 - - 17/05/2020:11:05:59 +0000 GET /articles/dynamic-dns-with-dhcp/
97.116.185.190 - - 17/05/2020:11:05:39 +0000 GET /reset.css
97.116.185.190 - - 17/05/2020:11:05:29 +0000 GET /style2.css
97.116.185.190 - - 17/05/2020:11:05:39 +0000 GET /images/jordan-80.png
97.116.185.190 - - 17/05/2020:11:05:02 +0000 GET /images/web/2009/banner.png
97.116.185.190 - - 17/05/2020:11:05:35 +0000 GET /favicon.ico
5.255.72.168 - - 17/05/2020:11:05:21 +0000 GET /
5.255.72.168 - - 17/05/2020:11:05:08 +0000 GET /blog/geekery/installing-windows-8-consumer-preview.html
46.105.14.53 - - 17/05/2020:11:05:33 +0000 GET /blog/tags/puppet?flav=rss20
5.102.173.71 - - 17/05/2020:11:05:13 +0000 GET /robots.txt
5.102.173.71 - - 17/05/2020:11:05:06 +0000 GET /projects/xdotool/
208.91.156.11 - - 17/05/2020:11:05:05 +0000 GET /files/logstash/logstash-1.3.2-monolithic.jar
66.249.73.185 - - 17/05/2020:11:05:58 +0000 GET /presentations/logstash-1/
74.125.176.81 - - 17/05/2020:11:05:28 +0000 GET /?flav=rss20
66.249.73.135 - - 17/05/2020:11:05:14 +0000 GET /blog/geekery/xdotool-2.20110530.html
187.45.193.158 - - 17/05/2020:11:05:54 +0000 GET /presentations/logstash-1/file/about-me/tequila-face.jpg
90.220.199.149 - - 17/05/2020:11:05:18 +0000 GET /blog/geekery/puppet-manage-homedirectory-contents.html
90.220.199.149 - - 17/05/2020:11:05:24 +0000 GET /reset.css
90.220.199.149 - - 17/05/2020:11:05:50 +0000 GET /style2.css
90.220.199.149 - - 17/05/2020:12:05:37 +0000 GET /images/jordan-80.png
90.220.199.149 - - 17/05/2020:12:05:21 +0000 GET /images/web/2009/banner.png
90.220.199.149 - - 17/05/2020:12:05:17 +0000 GET /favicon.ico
36.38.8.174 - - 17/05/2020:12:05:24 +0000 GET /blog/geekery/ssl-latency.html
36.38.8.174 - - 17/05/2020:12:05:36 +0000 GET /reset.css
36.38.8.174 - - 17/05/2020:12:05:14 +0000 GET /style2.css
36.38.8.174 - - 17/05/2020:12:05:44 +0000 GET /images/jordan-80.png
36.38.8.174 - - 17/05/2020:12:05:17 +0000 GET /images/web/2009/banner.png
36.38.8.174 - - 17/05/2020:12:05:39 +0000 GET /favicon.ico
71.207.12.53 - - 17/05/2020:12:05:17 +0000 GET /favicon.ico
220.241.45.142 - - 17/05/2020:12:05:07 +0000 GET /robots.txt
220.241.45.142 - - 17/05/2020:12:05:30 +0000 GET /projects/firefox-tabsearch/
209.85.238.199 - - 17/05/2020:12:05:21 +0000 GET /?flav=atom
46.105.14.53 - - 17/05/2020:12:05:53 +0000 GET /blog/tags/puppet?flav=rss20
66.249.73.135 - - 17/05/2020:12:05:28 +0000 GET /blog/tags/noise
View Code

2、处理主类

package service

/**
 * @program: demo
 * @description: Flink streaming job that ranks the most requested URLs (Top N) per sliding event-time window
 * @author: yang
 * @create: 2020-12-30 14:28
 */

import java.sql.Timestamp

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{MapState, MapStateDescriptor}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.{AssignerWithPeriodicWatermarks, KeyedProcessFunction}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.watermark.Watermark
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import utils.Utils

import scala.collection.mutable.ListBuffer



/**
 * Input record: one parsed line of the web-server access log.
 *
 * @param ip        client IP address
 * @param userId    user identifier column of the log line
 * @param eventTime time of the request, used as the event-time timestamp
 * @param method    HTTP request method
 * @param url       requested URL
 */
case class ApacheLogEvent(
  ip: String,
  userId: String,
  eventTime: Long,
  method: String,
  url: String
)

/**
 * Result of the window aggregation: the hit count of one URL in one window.
 *
 * @param url       requested URL
 * @param windowEnd end timestamp of the window the count belongs to
 * @param count     number of requests for the URL inside that window
 */
case class UrlViewCount(
  url: String,
  windowEnd: Long,
  count: Long
)


/**
 * Hot page statistics: entry point of the streaming job that prints the
 * most requested URLs of every sliding event-time window.
 */
object HotPage {

  /**
   * Builds the pipeline (read log file, count requests per URL and window,
   * rank the counts per window) and runs it.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Execution environment: event-time semantics, a single parallel task.
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // Read the raw log lines, parse each one into an ApacheLogEvent and
    // attach timestamps / watermarks (the extractor tolerates 10 s of lateness).
    val events = env
      .readTextFile(Utils.eventLogPath)
      .map(line => Utils.string2ApacheLogEvent(line))
      .assignTimestampsAndWatermarks(new HotPageEventTimeExtractor)

    // Count the requests per URL in 5-minute windows that slide every 5 seconds.
    val countsPerUrl = events
      .keyBy(event => event.url)
      .timeWindow(Time.minutes(5), Time.seconds(5))
      .aggregate(new PageCountAgg(), new PageWindowResult)

    // Regroup the per-URL counts by window and keep the 5 most requested URLs.
    val ranking = countsPerUrl
      .keyBy(viewCount => viewCount.windowEnd)
      .process(new TopNHotPage(5))

    ranking.print()

    env.execute("hot page count")
  }

}

/**
 * Ranks the URL counts of one window (the stream is keyed by window end) and
 * emits a textual report with the `topSize` most requested URLs.
 *
 * @param topSize number of URLs to keep in the report
 */
class TopNHotPage(topSize:Int) extends KeyedProcessFunction[Long,UrlViewCount,String]{
  // Keyed state holding URL -> hit count for the current key.
  // TODO: a ListState would work here as well.
  lazy val urlState:MapState[String,Long] = {
    val descriptor = new MapStateDescriptor[String,Long](
      "url-state-count",classOf[String],classOf[Long])
    getRuntimeContext.getMapState(descriptor)
  }

  /**
   * Stores the incoming count and registers an event-time timer at
   * `windowEnd + 1`.
   *
   * @param value count of one URL for one window
   * @param ctx   context giving access to the timer service
   * @param out   collector (unused here; output happens in `onTimer`)
   */
  override def processElement(value: UrlViewCount,
                              ctx: KeyedProcessFunction[Long, UrlViewCount, String]#Context,
                              out: Collector[String]): Unit = {
    // Remember every record as it arrives.
    urlState.put(value.url,value.count)
    // Schedule the ranking for this window.
    val fireAt = value.windowEnd + 1
    ctx.timerService().registerEventTimeTimer(fireAt)
  }

  /**
   * Drains the state, sorts the counts in descending order and emits the
   * top `topSize` entries as one formatted string.
   *
   * @param timestamp timestamp of the firing timer (`windowEnd + 1`)
   * @param ctx       timer context (unused)
   * @param out       collector receiving the report
   */
  override def onTimer(timestamp: Long,
                       ctx: KeyedProcessFunction[Long, UrlViewCount, String]#OnTimerContext,
                       out: Collector[String]): Unit = {
    // Copy the state content into a local buffer so it can be sorted.
    val collected = ListBuffer.empty[(String, Long)]
    val entries = urlState.entries().iterator()
    while (entries.hasNext) {
      val e = entries.next()
      collected += ((e.getKey, e.getValue))
    }
    // The state is no longer needed once it has been copied.
    urlState.clear()

    // Descending by hit count, then keep the first topSize entries.
    val ranked = collected.sortWith((a, b) => a._2 > b._2).take(topSize)

    // timestamp - 1 undoes the "+ 1" applied when the timer was registered.
    val header = s"时间:${new Timestamp(timestamp - 1)}\n"
    val lines = ranked.map { case (url, hits) => s"URL:$url 访问量:$hits\n" }.mkString
    val footer = "==================="

    out.collect(header + lines + footer)
  }
}

/**
 * Window function that wraps the pre-aggregated count of one URL into a
 * [[UrlViewCount]] tagged with the end of the window.
 */
class PageWindowResult() extends WindowFunction[Long,UrlViewCount,String,TimeWindow]{
  /**
   * @param key    URL the window was keyed by
   * @param window window being evaluated
   * @param input  aggregation results; only the first element is read
   * @param out    collector receiving the resulting [[UrlViewCount]]
   */
  override def apply(key: String, window: TimeWindow,
                     input: Iterable[Long],
                     out: Collector[UrlViewCount]): Unit = {
    // The window end identifies the window in the downstream ranking step.
    val windowEnd = window.getEnd
    val total = input.head
    out.collect(UrlViewCount(key, windowEnd, total))
  }
}

/**
 * Incremental aggregation that counts the events of one URL.
 * The accumulator is the running count; the result is that same count.
 */
class PageCountAgg() extends AggregateFunction[ApacheLogEvent,Long,Long]{
  /** Starts every count at zero. */
  override def createAccumulator(): Long = {
    0L
  }

  /** Adds one to the count for each incoming event; the event content is not inspected. */
  override def add(in: ApacheLogEvent, acc: Long): Long = {
    1L + acc
  }

  /** Combines two partial counts by summing them. */
  override def merge(acc: Long, acc1: Long): Long = {
    acc1 + acc
  }

  /** Returns the accumulated count unchanged. */
  override def getResult(acc: Long): Long = {
    acc
  }
}


/**
 * Periodic watermark assigner for [[ApacheLogEvent]]: the watermark trails
 * the largest event time seen so far by a fixed out-of-orderness bound.
 */
class HotPageEventTimeExtractor extends AssignerWithPeriodicWatermarks[ApacheLogEvent]{

  // Largest event time observed so far.
  var currentMaxEventTime = 0L
  // Maximum tolerated out-of-orderness: 10 s, expressed in milliseconds.
  val maxOufOfOrderness = 10000

  /**
   * Computes the current watermark.
   *
   * @return a watermark at the largest seen event time minus the out-of-orderness bound
   */
  override def getCurrentWatermark: Watermark = {
    val watermarkTime = currentMaxEventTime - maxOufOfOrderness
    new Watermark(watermarkTime)
  }

  /**
   * Returns the event-time field of the element and tracks the maximum seen.
   *
   * @param element                  the event being processed
   * @param previousElementTimestamp timestamp previously attached to the element (unused)
   * @return `element.eventTime`
   */
  override def extractTimestamp(element: ApacheLogEvent, previousElementTimestamp: Long): Long = {
    val eventTime = element.eventTime
    // Only ever move the maximum forward.
    if (eventTime > currentMaxEventTime) {
      currentMaxEventTime = eventTime
    }
    eventTime
  }


}

3、Utils工具类

package utils

/**
 * @program: demo
 * @description: Input file locations and line parsers shared by the Flink jobs
 * @author: yang
 * @create: 2020-12-30 14:26
 */
import java.text.SimpleDateFormat

import service.{AdClickEvent, ApacheLogEvent, UserBehavior}


/**
 * Shared helpers: locations of the input files and parsers that turn one raw
 * text line into the corresponding event object.
 */
object Utils {

  // Path of the access (event) log.
  val eventLogPath = "E:\\java\\demo\\src\\main\\resources\\file\\data2.log"
  // Path of the ad-click log.
  val adClickLogPath = "E:\\java\\demo\\src\\main\\resources\\file\\data3.csv"

  // Path of the user-behavior log.
  val userBehaviorLogPath="E:\\java\\demo\\src\\main\\resources\\file\\data1.csv"


  /**
   * Parses one space-separated access-log line into an [[ApacheLogEvent]].
   *
   * Columns used: 0 = IP, 1 = user id, 3 = timestamp (`dd/MM/yyyy:HH:mm:ss`),
   * 4 = UTC offset (e.g. `+0000`), 5 = HTTP method, 6 = URL.
   *
   * The offset column is parsed together with the timestamp so that the
   * resulting epoch milliseconds do not depend on the JVM default time zone.
   *
   * @param line raw log line
   * @return the parsed event; `eventTime` is in epoch milliseconds
   * @throws ArrayIndexOutOfBoundsException if the line has fewer than 7 columns
   * @throws java.text.ParseException if timestamp or offset cannot be parsed
   */
  def string2ApacheLogEvent(line:String):ApacheLogEvent={
    val fields = line.split(" ")
    // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
    val dateFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss Z")
    // Honor the explicit offset carried by the log line instead of discarding it.
    val timeStamp = dateFormat.parse(s"${fields(3).trim} ${fields(4).trim}").getTime
    ApacheLogEvent(fields(0).trim,fields(1).trim,timeStamp,
      fields(5).trim,fields(6).trim)
  }

  /**
   * Parses one comma-separated line into an `AdClickEvent`.
   *
   * Columns 0, 1 and 4 are converted to `Long`; columns 2 and 3 are passed
   * through as trimmed strings. The arguments are positional; their meaning
   * is defined by `AdClickEvent`, which is declared elsewhere.
   *
   * @param line raw CSV line
   * @return the parsed ad-click event
   */
  def string2ClickEvent(line:String):AdClickEvent={
    val dataArray = line.split(",")
    AdClickEvent(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim, dataArray(3).trim, dataArray(4).trim.toLong)
  }

  /**
   * Parses one comma-separated line into a `UserBehavior`.
   *
   * Columns 0, 1, 2 and 4 are converted to `Long`; columns 3 and 5 are passed
   * through as trimmed strings. The arguments are positional; their meaning
   * is defined by `UserBehavior`, which is declared elsewhere.
   *
   * @param line raw CSV line
   * @return the parsed user-behavior event
   */
  def string2UserBehavior(line:String):UserBehavior={
    val fields = line.split(",")
    UserBehavior(fields(0).trim.toLong,
      fields(1).trim.toLong,
      fields(2).trim.toLong,
      fields(3).trim,
      fields(4).trim.toLong,
      fields(5).trim
    )

  }

}

 

posted @ 2021-01-05 11:49  小白啊小白,Fighting  阅读(243)  评论(0)  编辑  收藏  举报