Flink之热门页面统计
1、数据格式
83.149.9.123 - - 17/05/2020:10:05:03 +0000 GET /presentations/logstash-kafkamonitor-2020/images/kibana-search.png 83.149.9.123 - - 17/05/2020:10:05:43 +0000 GET /presentations/logstash-kafkamonitor-2020/images/kibana-dashboard3.png 83.149.9.123 - - 17/05/2020:10:05:47 +0000 GET /presentations/logstash-kafkamonitor-2020/plugin/highlight/highlight.js 83.149.9.123 - - 17/05/2020:10:05:12 +0000 GET /presentations/logstash-kafkamonitor-2020/plugin/zoom-js/zoom.js 83.149.9.123 - - 17/05/2020:10:05:07 +0000 GET /presentations/logstash-kafkamonitor-2020/plugin/notes/notes.js 83.149.9.123 - - 17/05/2020:10:05:34 +0000 GET /presentations/logstash-kafkamonitor-2020/images/sad-medic.png 83.149.9.123 - - 17/05/2020:10:05:57 +0000 GET /presentations/logstash-kafkamonitor-2020/css/fonts/Roboto-Bold.ttf 83.149.9.123 - - 17/05/2020:10:05:50 +0000 GET /presentations/logstash-kafkamonitor-2020/css/fonts/Roboto-Regular.ttf 83.149.9.123 - - 17/05/2020:10:05:24 +0000 GET /presentations/logstash-kafkamonitor-2020/images/frontend-response-codes.png 83.149.9.123 - - 17/05/2020:10:05:50 +0000 GET /presentations/logstash-kafkamonitor-2020/images/kibana-dashboard.png 83.149.9.123 - - 17/05/2020:10:05:46 +0000 GET /presentations/logstash-kafkamonitor-2020/images/Dreamhost_logo.svg 83.149.9.123 - - 17/05/2020:10:05:11 +0000 GET /presentations/logstash-kafkamonitor-2020/images/kibana-dashboard2.png 83.149.9.123 - - 17/05/2020:10:05:19 +0000 GET /presentations/logstash-kafkamonitor-2020/images/apache-icon.gif 83.149.9.123 - - 17/05/2020:10:05:33 +0000 GET /presentations/logstash-kafkamonitor-2020/images/nagios-sms5.png 83.149.9.123 - - 17/05/2020:10:05:00 +0000 GET /presentations/logstash-kafkamonitor-2020/images/redis.png 83.149.9.123 - - 17/05/2020:10:05:25 +0000 GET /presentations/logstash-kafkamonitor-2020/images/elasticsearch.png 83.149.9.123 - - 17/05/2020:10:05:59 +0000 GET /presentations/logstash-kafkamonitor-2020/images/logstashbook.png 83.149.9.123 - - 17/05/2020:10:05:30 +0000 GET 
/presentations/logstash-kafkamonitor-2020/images/github-contributions.png 83.149.9.123 - - 17/05/2020:10:05:53 +0000 GET /presentations/logstash-kafkamonitor-2020/css/print/paper.css 83.149.9.123 - - 17/05/2020:10:05:24 +0000 GET /presentations/logstash-kafkamonitor-2020/images/1983_delorean_dmc-12-pic-38289.jpeg 83.149.9.123 - - 17/05/2020:10:05:54 +0000 GET /presentations/logstash-kafkamonitor-2020/images/simple-inputs-filters-outputs.jpg 83.149.9.123 - - 17/05/2020:10:05:33 +0000 GET /presentations/logstash-kafkamonitor-2020/images/tiered-outputs-to-inputs.jpg 83.149.9.123 - - 17/05/2020:10:05:56 +0000 GET /favicon.ico 24.236.252.67 - - 17/05/2020:10:05:40 +0000 GET /favicon.ico 93.114.45.13 - - 17/05/2020:10:05:14 +0000 GET /articles/dynamic-dns-with-dhcp/ 93.114.45.13 - - 17/05/2020:10:05:04 +0000 GET /reset.css 93.114.45.13 - - 17/05/2020:10:05:45 +0000 GET /style2.css 93.114.45.13 - - 17/05/2020:10:05:14 +0000 GET /favicon.ico 93.114.45.13 - - 17/05/2020:10:05:17 +0000 GET /images/jordan-80.png 93.114.45.13 - - 17/05/2020:10:05:21 +0000 GET /images/web/2009/banner.png 66.249.73.135 - - 17/05/2020:10:05:40 +0000 GET /blog/tags/ipv6 50.16.19.13 - - 17/05/2020:10:05:10 +0000 GET /blog/tags/puppet?flav=rss20 66.249.73.185 - - 17/05/2020:10:05:37 +0000 GET / 110.136.166.128 - - 17/05/2020:10:05:35 +0000 GET /projects/xdotool/ 46.105.14.53 - - 17/05/2020:10:05:03 +0000 GET /blog/tags/puppet?flav=rss20 110.136.166.128 - - 17/05/2020:10:05:06 +0000 GET /reset.css 110.136.166.128 - - 17/05/2020:10:05:03 +0000 GET /style2.css 110.136.166.128 - - 17/05/2020:10:05:41 +0000 GET /favicon.ico 110.136.166.128 - - 17/05/2020:10:05:32 +0000 GET /images/jordan-80.png 123.125.71.35 - - 17/05/2020:10:05:46 +0000 GET /blog/tags/release 110.136.166.128 - - 17/05/2020:10:05:08 +0000 GET /images/web/2009/banner.png 50.150.204.184 - - 17/05/2020:10:05:46 +0000 GET /images/googledotcom.png 207.241.237.225 - - 17/05/2020:10:05:58 +0000 GET /blog/tags/examples 200.49.190.101 - - 
17/05/2020:10:05:36 +0000 GET /reset.css 200.49.190.100 - - 17/05/2020:10:05:38 +0000 GET /blog/tags/web 200.49.190.101 - - 17/05/2020:10:05:11 +0000 GET /style2.css 200.49.190.101 - - 17/05/2020:10:05:37 +0000 GET /images/jordan-80.png 66.249.73.185 - - 17/05/2020:10:05:00 +0000 GET /reset.css 66.249.73.135 - - 17/05/2020:10:05:16 +0000 GET /blog/tags/munin 66.249.73.135 - - 17/05/2020:10:05:33 +0000 GET /blog/tags/firefox?flav=rss20 66.249.73.135 - - 17/05/2020:10:05:17 +0000 GET /blog/geekery/eventdb-ideas.html 67.214.178.190 - - 17/05/2020:10:05:48 +0000 GET / 67.214.178.190 - - 17/05/2020:10:05:18 +0000 GET /blog/geekery/installing-windows-8-consumer-preview.html 207.241.237.220 - - 17/05/2020:10:05:28 +0000 GET /blog/tags/projects 46.105.14.53 - - 17/05/2020:10:05:44 +0000 GET /blog/tags/puppet?flav=rss20 207.241.237.227 - - 17/05/2020:10:05:47 +0000 GET /blog/geekery/soekris-gpio.html 91.177.205.119 - - 17/05/2020:10:05:22 +0000 GET /blog/geekery/xvfb-firefox.html 91.177.205.119 - - 17/05/2020:10:05:34 +0000 GET /reset.css 91.177.205.119 - - 17/05/2020:10:05:37 +0000 GET /style2.css 91.177.205.119 - - 17/05/2020:10:05:54 +0000 GET /images/jordan-80.png 91.177.205.119 - - 17/05/2020:10:05:31 +0000 GET /images/web/2009/banner.png 91.177.205.119 - - 17/05/2020:10:05:32 +0000 GET /favicon.ico 66.249.73.185 - - 17/05/2020:10:05:22 +0000 GET /doc/index.html?org/elasticsearch/action/search/SearchResponse.html 207.241.237.228 - - 17/05/2020:10:05:40 +0000 GET /blog/tags/defcon 207.241.237.101 - - 17/05/2020:10:05:51 +0000 GET /blog/tags/regex 87.169.99.232 - - 17/05/2020:10:05:59 +0000 GET /presentations/puppet-at-loggly/puppet-at-loggly.pdf.html 209.85.238.199 - - 17/05/2020:10:05:30 +0000 GET /blog/tags/firefox?flav=rss20 209.85.238.199 - - 17/05/2020:10:05:15 +0000 GET /test.xml 81.220.24.207 - - 17/05/2020:10:05:13 +0000 GET /blog/geekery/ssl-latency.html 81.220.24.207 - - 17/05/2020:10:05:44 +0000 GET /reset.css 81.220.24.207 - - 17/05/2020:10:05:26 +0000 GET 
/images/jordan-80.png 81.220.24.207 - - 17/05/2020:10:05:39 +0000 GET /style2.css 81.220.24.207 - - 17/05/2020:10:05:52 +0000 GET /images/web/2009/banner.png 81.220.24.207 - - 17/05/2020:10:05:21 +0000 GET /favicon.ico 66.249.73.135 - - 17/05/2020:11:05:17 +0000 GET /blog/geekery/vmware-cpu-performance.html 46.105.14.53 - - 17/05/2020:11:05:42 +0000 GET /blog/tags/puppet?flav=rss20 218.30.103.62 - - 17/05/2020:11:05:11 +0000 GET /robots.txt 218.30.103.62 - - 17/05/2020:11:05:46 +0000 GET /robots.txt 218.30.103.62 - - 17/05/2020:11:05:45 +0000 GET /projects/fex/ 74.125.40.20 - - 17/05/2020:11:05:59 +0000 GET /?flav=rss20 71.212.224.97 - - 17/05/2020:11:05:05 +0000 GET /projects/xdotool/ 71.212.224.97 - - 17/05/2020:11:05:15 +0000 GET /reset.css 71.212.224.97 - - 17/05/2020:11:05:22 +0000 GET /style2.css 71.212.224.97 - - 17/05/2020:11:05:11 +0000 GET /images/jordan-80.png 71.212.224.97 - - 17/05/2020:11:05:28 +0000 GET /images/web/2009/banner.png 218.30.103.62 - - 17/05/2020:11:05:17 +0000 GET /projects/xdotool/xdotool.xhtml 108.174.55.234 - - 17/05/2020:11:05:26 +0000 GET /?flav=rss20 218.30.103.62 - - 17/05/2020:11:05:37 +0000 GET /blog/geekery/c-vs-python-bdb.html 121.107.188.202 - - 17/05/2020:11:05:09 +0000 GET /presentations/logstash-kafkamonitor-2020/images/kibana-dashboard3.png 218.30.103.62 - - 17/05/2020:11:05:39 +0000 GET /blog/productivity/better-zsh-xterm-title-fix.html 218.30.103.62 - - 17/05/2020:11:05:11 +0000 GET /blog/geekery/xvfb-firefox.html 218.30.103.62 - - 17/05/2020:11:05:00 +0000 GET /blog/geekery/puppet-facts-into-mcollective.html 198.46.149.143 - - 17/05/2020:11:05:10 +0000 GET /blog/geekery/disabling-battery-in-ubuntu-vms.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+semicomplete%2Fmain+%28semicomplete.com+-+Jordan+Sissel%29 198.46.149.143 - - 17/05/2020:11:05:48 +0000 GET 
/blog/geekery/solving-good-or-bad-problems.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+semicomplete%2Fmain+%28semicomplete.com+-+Jordan+Sissel%29 218.30.103.62 - - 17/05/2020:11:05:28 +0000 GET /blog/geekery/jquery-interface-puffer.html%20target= 218.30.103.62 - - 17/05/2020:11:05:05 +0000 GET /blog/geekery/ec2-reserved-vs-ondemand.html 66.249.73.135 - - 17/05/2020:11:05:31 +0000 GET /blog/web/firefox-scrolling-fix.html 86.1.76.62 - - 17/05/2020:11:05:36 +0000 GET /projects/xdotool/ 86.1.76.62 - - 17/05/2020:11:05:25 +0000 GET /reset.css 86.1.76.62 - - 17/05/2020:11:05:19 +0000 GET /style2.css 86.1.76.62 - - 17/05/2020:11:05:03 +0000 GET /favicon.ico 86.1.76.62 - - 17/05/2020:11:05:28 +0000 GET /images/jordan-80.png 86.1.76.62 - - 17/05/2020:11:05:07 +0000 GET /images/web/2009/banner.png 66.249.73.135 - - 17/05/2020:11:05:58 +0000 GET /blog/tags/bdb 107.170.41.69 - - 17/05/2020:11:05:31 +0000 GET /?flav=atom 50.16.19.13 - - 17/05/2020:11:05:14 +0000 GET /blog/tags/puppet?flav=rss20 46.105.14.53 - - 17/05/2020:11:05:02 +0000 GET /blog/tags/puppet?flav=rss20 208.115.111.72 - - 17/05/2020:11:05:26 +0000 GET /blog/rants/fedora-yum.html 208.115.111.72 - - 17/05/2020:11:05:32 +0000 GET /blog/tags/grok 208.115.111.72 - - 17/05/2020:11:05:12 +0000 GET /blog/tags/is%20it%20done%20yet 208.115.111.72 - - 17/05/2020:11:05:07 +0000 GET /blog/tags/statistics 50.180.79.170 - - 17/05/2020:11:05:50 +0000 GET /favicon.ico 208.115.111.72 - - 17/05/2020:11:05:05 +0000 GET /blog/tags/subversion 208.115.111.72 - - 17/05/2020:11:05:52 +0000 GET /blog/web/194.html 208.115.111.72 - - 17/05/2020:11:05:23 +0000 GET /files/blogposts/20070901/?C=D;O=A 208.115.111.72 - - 17/05/2020:11:05:15 +0000 GET /files/blogposts/20080109/boost_xpressive_test.cpp 208.115.111.72 - - 17/05/2020:11:05:38 +0000 GET /files/blogposts/20090520/ 208.115.111.72 - - 17/05/2020:11:05:41 +0000 GET /files/fastsplit/?C=M;O=D 208.115.111.72 - - 17/05/2020:11:05:19 +0000 GET 
/files/xdotool/docs/man/?C=M;O=D 208.115.111.72 - - 17/05/2020:11:05:16 +0000 GET /scripts/python/wrap/?C=N;O=D 208.115.111.72 - - 17/05/2020:11:05:32 +0000 GET /files/images/?C=S;O=D 208.115.111.72 - - 17/05/2020:11:05:00 +0000 GET /files/blogposts/20080611/ 208.115.111.72 - - 17/05/2020:11:05:16 +0000 GET /files/logstash/?C=D;O=D 208.115.111.72 - - 17/05/2020:11:05:53 +0000 GET /presentations/hackday06/ 208.115.111.72 - - 17/05/2020:11:05:29 +0000 GET /scripts/grok-py-test/ 208.115.111.72 - - 17/05/2020:11:05:08 +0000 GET /?N=A&page=21 208.115.111.72 - - 17/05/2020:11:05:49 +0000 GET /blog/geekery/oniguruma-named-capture-example.html?commentlimit=0 208.115.111.72 - - 17/05/2020:11:05:01 +0000 GET /blog/geekery/ssh-key-invalid-hack.html?commentlimit=0 208.115.111.72 - - 17/05/2020:11:05:31 +0000 GET /blog/geekery/server-side-javascript.html 208.115.111.72 - - 17/05/2020:11:05:15 +0000 GET /blog/geekery/yahoo-hackday-08.html 105.235.130.196 - - 17/05/2020:11:05:01 +0000 GET /images/googledotcom.png 174.37.205.76 - - 17/05/2020:11:05:19 +0000 GET /blog 54.255.13.204 - - 17/05/2020:11:05:03 +0000 GET /articles/ssh-security/ 105.235.130.196 - - 17/05/2020:11:05:45 +0000 GET /blog/tags/X11 54.255.13.204 - - 17/05/2020:11:05:55 +0000 GET /reset.css 54.255.13.204 - - 17/05/2020:11:05:32 +0000 GET /style2.css 54.255.13.204 - - 17/05/2020:11:05:10 +0000 GET /favicon.ico 105.235.130.196 - - 17/05/2020:11:05:20 +0000 GET /reset.css 54.255.13.204 - - 17/05/2020:11:05:46 +0000 GET /images/jordan-80.png 54.255.13.204 - - 17/05/2020:11:05:17 +0000 GET /images/web/2009/banner.png 105.235.130.196 - - 17/05/2020:11:05:47 +0000 GET /style2.css 105.235.130.196 - - 17/05/2020:11:05:37 +0000 GET /images/jordan-80.png 105.235.130.196 - - 17/05/2020:11:05:22 +0000 GET /images/web/2009/banner.png 134.76.249.10 - - 17/05/2020:11:05:01 +0000 GET /projects/xdotool/ 134.76.249.10 - - 17/05/2020:11:05:09 +0000 GET /reset.css 134.76.249.10 - - 17/05/2020:11:05:57 +0000 GET /style2.css 
134.76.249.10 - - 17/05/2020:11:05:23 +0000 GET /favicon.ico 134.76.249.10 - - 17/05/2020:11:05:40 +0000 GET /images/jordan-80.png 134.76.249.10 - - 17/05/2020:11:05:50 +0000 GET /images/web/2009/banner.png 134.76.249.10 - - 17/05/2020:11:05:47 +0000 GET /projects/xdotool 134.76.249.10 - - 17/05/2020:11:05:13 +0000 GET /projects/xdotool/ 66.249.73.135 - - 17/05/2020:11:05:26 +0000 GET /?flav=atom 207.241.237.220 - - 17/05/2020:11:05:24 +0000 GET /blog/tags/C?page=2 68.184.202.186 - - 17/05/2020:11:05:28 +0000 GET /projects/xpathtool/ 68.184.202.186 - - 17/05/2020:11:05:02 +0000 GET /reset.css 68.184.202.186 - - 17/05/2020:11:05:05 +0000 GET /images/jordan-80.png 68.184.202.186 - - 17/05/2020:11:05:02 +0000 GET /style2.css 68.184.202.186 - - 17/05/2020:11:05:37 +0000 GET /images/web/2009/banner.png 68.184.202.186 - - 17/05/2020:11:05:58 +0000 GET /favicon.ico 46.105.14.53 - - 17/05/2020:11:05:29 +0000 GET /blog/tags/puppet?flav=rss20 66.249.73.135 - - 17/05/2020:11:05:00 +0000 GET /?flav=rss20 24.233.162.179 - - 17/05/2020:11:05:31 +0000 GET /favicon.ico 123.125.71.117 - - 17/05/2020:11:05:16 +0000 GET / 220.181.108.153 - - 17/05/2020:11:05:09 +0000 GET / 65.19.138.34 - - 17/05/2020:11:05:40 +0000 GET / 66.249.73.135 - - 17/05/2020:11:05:32 +0000 GET /blog/geekery/rhapsody-on-linux.html 97.116.185.190 - - 17/05/2020:11:05:59 +0000 GET /articles/dynamic-dns-with-dhcp/ 97.116.185.190 - - 17/05/2020:11:05:39 +0000 GET /reset.css 97.116.185.190 - - 17/05/2020:11:05:29 +0000 GET /style2.css 97.116.185.190 - - 17/05/2020:11:05:39 +0000 GET /images/jordan-80.png 97.116.185.190 - - 17/05/2020:11:05:02 +0000 GET /images/web/2009/banner.png 97.116.185.190 - - 17/05/2020:11:05:35 +0000 GET /favicon.ico 5.255.72.168 - - 17/05/2020:11:05:21 +0000 GET / 5.255.72.168 - - 17/05/2020:11:05:08 +0000 GET /blog/geekery/installing-windows-8-consumer-preview.html 46.105.14.53 - - 17/05/2020:11:05:33 +0000 GET /blog/tags/puppet?flav=rss20 5.102.173.71 - - 17/05/2020:11:05:13 +0000 GET 
/robots.txt 5.102.173.71 - - 17/05/2020:11:05:06 +0000 GET /projects/xdotool/ 208.91.156.11 - - 17/05/2020:11:05:05 +0000 GET /files/logstash/logstash-1.3.2-monolithic.jar 66.249.73.185 - - 17/05/2020:11:05:58 +0000 GET /presentations/logstash-1/ 74.125.176.81 - - 17/05/2020:11:05:28 +0000 GET /?flav=rss20 66.249.73.135 - - 17/05/2020:11:05:14 +0000 GET /blog/geekery/xdotool-2.20110530.html 187.45.193.158 - - 17/05/2020:11:05:54 +0000 GET /presentations/logstash-1/file/about-me/tequila-face.jpg 90.220.199.149 - - 17/05/2020:11:05:18 +0000 GET /blog/geekery/puppet-manage-homedirectory-contents.html 90.220.199.149 - - 17/05/2020:11:05:24 +0000 GET /reset.css 90.220.199.149 - - 17/05/2020:11:05:50 +0000 GET /style2.css 90.220.199.149 - - 17/05/2020:12:05:37 +0000 GET /images/jordan-80.png 90.220.199.149 - - 17/05/2020:12:05:21 +0000 GET /images/web/2009/banner.png 90.220.199.149 - - 17/05/2020:12:05:17 +0000 GET /favicon.ico 36.38.8.174 - - 17/05/2020:12:05:24 +0000 GET /blog/geekery/ssl-latency.html 36.38.8.174 - - 17/05/2020:12:05:36 +0000 GET /reset.css 36.38.8.174 - - 17/05/2020:12:05:14 +0000 GET /style2.css 36.38.8.174 - - 17/05/2020:12:05:44 +0000 GET /images/jordan-80.png 36.38.8.174 - - 17/05/2020:12:05:17 +0000 GET /images/web/2009/banner.png 36.38.8.174 - - 17/05/2020:12:05:39 +0000 GET /favicon.ico 71.207.12.53 - - 17/05/2020:12:05:17 +0000 GET /favicon.ico 220.241.45.142 - - 17/05/2020:12:05:07 +0000 GET /robots.txt 220.241.45.142 - - 17/05/2020:12:05:30 +0000 GET /projects/firefox-tabsearch/ 209.85.238.199 - - 17/05/2020:12:05:21 +0000 GET /?flav=atom 46.105.14.53 - - 17/05/2020:12:05:53 +0000 GET /blog/tags/puppet?flav=rss20 66.249.73.135 - - 17/05/2020:12:05:28 +0000 GET /blog/tags/noise
2、处理主类
package service

/*
 * Program: demo
 * Author:  yang
 * Created: 2020-12-30 14:28
 */

import java.sql.Timestamp

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{MapState, MapStateDescriptor}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.{AssignerWithPeriodicWatermarks, KeyedProcessFunction}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.watermark.Watermark
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import utils.Utils

import scala.collection.mutable.ListBuffer

/**
 * One parsed access-log record; the input element type of the job.
 *
 * @param ip        client IP address
 * @param userId    user id column of the log line
 * @param eventTime time of the request; used as the stream's event timestamp
 * @param method    HTTP request method
 * @param url       requested URL
 */
case class ApacheLogEvent(
    ip: String,
    userId: String,
    eventTime: Long,
    method: String,
    url: String
)

/**
 * Per-URL hit count for a single window; the output of the window aggregation.
 *
 * @param url       requested URL
 * @param windowEnd end timestamp of the window the count belongs to
 * @param count     number of hits counted for `url` in that window
 */
case class UrlViewCount(
    url: String,
    windowEnd: Long,
    count: Long
)

/**
 * Hot page statistics: prints the most requested URLs per sliding window.
 */
object HotPage {

  /**
   * Builds and runs the job: read log lines, count hits per URL in sliding
   * windows, then rank the URLs of each window and print the top five.
   *
   * @param args command line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Execution environment: event-time semantics, single parallel instance.
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // Parse every text line into an ApacheLogEvent and attach timestamps/watermarks.
    val events = env
      .readTextFile(Utils.eventLogPath)
      .map(line => Utils.string2ApacheLogEvent(line))
      .assignTimestampsAndWatermarks(new HotPageEventTimeExtractor)

    // Count hits per URL over a 5 minute window that slides every 5 seconds.
    val countsPerWindow = events
      .keyBy(_.url)
      .timeWindow(Time.minutes(5), Time.seconds(5))
      .aggregate(new PageCountAgg(), new PageWindowResult)

    // Regroup the counts by window end and rank the URLs inside each window.
    countsPerWindow
      .keyBy(_.windowEnd)
      .process(new TopNHotPage(5))
      .print()

    env.execute("hot page count")
  }
}

/**
 * Collects the [[UrlViewCount]] records that share a window end and, when the
 * timer for that window fires, emits a text report of the most visited URLs.
 *
 * @param topSize maximum number of URLs listed in each report
 */
class TopNHotPage(topSize: Int) extends KeyedProcessFunction[Long, UrlViewCount, String] {

  // Keyed state mapping URL -> hit count for the current key (the window end).
  // The original author notes that a ListState would work here as well.
  lazy val urlState: MapState[String, Long] = getRuntimeContext.getMapState(
    new MapStateDescriptor[String, Long]("url-state-count", classOf[String], classOf[Long])
  )

  /**
   * Stores the incoming count under its URL and registers an event-time timer
   * at `windowEnd + 1`.
   */
  override def processElement(
      value: UrlViewCount,
      ctx: KeyedProcessFunction[Long, UrlViewCount, String]#Context,
      out: Collector[String]): Unit = {
    urlState.put(value.url, value.count)
    ctx.timerService().registerEventTimeTimer(value.windowEnd + 1)
  }

  /**
   * Drains the state, sorts the entries by count in descending order, keeps
   * the first `topSize` of them and emits the formatted report.
   */
  override def onTimer(
      timestamp: Long,
      ctx: KeyedProcessFunction[Long, UrlViewCount, String]#OnTimerContext,
      out: Collector[String]): Unit = {
    // Copy the state content into a local buffer before the state is cleared.
    val snapshot = ListBuffer.empty[(String, Long)]
    val entries = urlState.entries().iterator()
    while (entries.hasNext) {
      val entry = entries.next()
      val pair = entry.getKey -> entry.getValue
      snapshot += pair
    }
    urlState.clear()

    // Descending by hit count; the sort is stable, so ties keep iteration order.
    val ranked = snapshot.sortBy(_._2)(Ordering[Long].reverse).take(topSize)

    // The timer was registered at windowEnd + 1, so subtract 1 to print the window end.
    val header = s"时间:${new Timestamp(timestamp - 1)}\n"
    val rows = ranked.map { case (url, hits) => s"URL:$url 访问量:$hits\n" }
    out.collect(header + rows.mkString + "===================")
  }
}

/**
 * Window function that wraps the pre-aggregated count of one URL into a
 * [[UrlViewCount]] tagged with the window end.
 */
class PageWindowResult() extends WindowFunction[Long, UrlViewCount, String, TimeWindow] {

  override def apply(
      key: String,
      window: TimeWindow,
      input: Iterable[Long],
      out: Collector[UrlViewCount]): Unit = {
    // Only the first element of `input` is read: the count produced by the aggregate.
    val hits = input.iterator.next()
    // window.getEnd identifies the window this count belongs to.
    out.collect(UrlViewCount(key, window.getEnd, hits))
  }
}

/**
 * Incremental aggregate that counts events: the accumulator starts at zero,
 * grows by one per event, and is returned unchanged as the result.
 */
class PageCountAgg() extends AggregateFunction[ApacheLogEvent, Long, Long] {

  override def createAccumulator(): Long = {
    0L
  }

  override def add(in: ApacheLogEvent, acc: Long): Long = {
    acc + 1
  }

  override def merge(acc: Long, acc1: Long): Long = {
    acc + acc1
  }

  override def getResult(acc: Long): Long = {
    acc
  }
}

/**
 * Periodic watermark assigner: the watermark trails the largest event time
 * seen so far by `maxOufOfOrderness` milliseconds.
 */
class HotPageEventTimeExtractor extends AssignerWithPeriodicWatermarks[ApacheLogEvent] {

  // Largest event timestamp observed so far.
  var currentMaxEventTime = 0L

  // Maximum tolerated out-of-orderness: 10 seconds, expressed in milliseconds.
  val maxOufOfOrderness = 10000

  /**
   * @return a watermark `maxOufOfOrderness` behind the largest timestamp seen
   */
  override def getCurrentWatermark: Watermark =
    new Watermark(currentMaxEventTime - maxOufOfOrderness)

  /**
   * Uses the record's `eventTime` as its timestamp and tracks the maximum.
   *
   * @param element                  the record being timestamped
   * @param previousElementTimestamp timestamp previously attached to the record (ignored)
   * @return the record's `eventTime`
   */
  override def extractTimestamp(element: ApacheLogEvent, previousElementTimestamp: Long): Long = {
    val eventTs = element.eventTime
    if (eventTs > currentMaxEventTime) {
      currentMaxEventTime = eventTs
    }
    eventTs
  }
}
3、Utils工具类
package utils

/*
 * Program: demo
 * Author:  yang
 * Created: 2020-12-30 14:26
 */

import java.text.SimpleDateFormat

import service.{AdClickEvent, ApacheLogEvent, UserBehavior}

/**
 * Input file locations and line parsers shared by the demo jobs.
 */
object Utils {

  // Path of the access (event) log file.
  val eventLogPath: String = "E:\\java\\demo\\src\\main\\resources\\file\\data2.log"

  // Path of the ad click log file.
  val adClickLogPath: String = "E:\\java\\demo\\src\\main\\resources\\file\\data3.csv"

  // Path of the user behavior log file.
  val userBehaviorLogPath: String = "E:\\java\\demo\\src\\main\\resources\\file\\data1.csv"

  /**
   * Parses one space separated access-log line into an [[ApacheLogEvent]].
   *
   * Column 3 is parsed with the pattern `dd/MM/yyyy:HH:mm:ss` and converted
   * to epoch milliseconds; columns 0, 1, 5 and 6 become ip, userId, method
   * and url. Column 4 is not read.
   * NOTE(review): no time zone is set on the formatter, so the JVM default
   * zone presumably applies - confirm this is intended.
   *
   * @param line raw log line
   * @return the parsed event
   */
  def string2ApacheLogEvent(line: String): ApacheLogEvent = {
    val tokens = line.split(" ")
    def token(i: Int): String = tokens(i).trim

    // A new formatter is created on every call, as in the original code.
    val parser = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss")
    val epochMillis = parser.parse(token(3)).getTime

    ApacheLogEvent(token(0), token(1), epochMillis, token(5), token(6))
  }

  /**
   * Parses one comma separated line into an [[AdClickEvent]].
   *
   * Columns 0, 1 and 4 are converted to `Long`; columns 2 and 3 stay strings.
   *
   * @param line raw CSV line
   * @return the parsed ad click event
   */
  def string2ClickEvent(line: String): AdClickEvent = {
    val cols = line.split(",")
    def col(i: Int): String = cols(i).trim

    AdClickEvent(col(0).toLong, col(1).toLong, col(2), col(3), col(4).toLong)
  }

  /**
   * Parses one comma separated line into a [[UserBehavior]].
   *
   * Columns 0, 1, 2 and 4 are converted to `Long`; columns 3 and 5 stay strings.
   *
   * @param line raw CSV line
   * @return the parsed user behavior record
   */
  def string2UserBehavior(line: String): UserBehavior = {
    val cols = line.split(",")
    def col(i: Int): String = cols(i).trim

    UserBehavior(
      col(0).toLong,
      col(1).toLong,
      col(2).toLong,
      col(3),
      col(4).toLong,
      col(5)
    )
  }
}
本文来自博客园,作者:小白啊小白,Fighting,转载请注明原文链接:https://www.cnblogs.com/ywjfx/p/14234937.html