爬虫acm比赛成绩(多页成绩整合在一起、获取复制不了的数据)(hihocoder、计蒜客)
https://github.com/congmingyige/web-crawler_rank-of-competition-in-JiSuanKe-and-hihocoder
1. 计蒜客(获取复制不了的数据)
1 import java.util.Scanner; 2 3 /** 4 * 无法从网页上获得源代码 5 */ 6 7 public class GetScore_jisuanke { 8 9 static String PREFIX_UNICODE= "\\u"; 10 static char ascii2Char(String str) { 11 if (str.length() != 6) { 12 throw new IllegalArgumentException("Ascii string of a native character must be 6 character."); 13 } 14 if (!PREFIX_UNICODE.equals(str.substring(0, 2))) { 15 throw new IllegalArgumentException("Ascii string of a native character must start with \"\\u\"."); 16 } 17 String tmp = str.substring(2, 4); // 将十六进制转为十进制 18 int code = Integer.parseInt(tmp, 16) << 8; // 转为高位,后与地位相加 19 tmp = str.substring(4, 6); 20 code += Integer.parseInt(tmp, 16); // 与低8为相加 21 return (char) code; 22 } 23 24 static String ascii2Native(String str) { 25 StringBuilder sb = new StringBuilder(); 26 int begin = 0; 27 int index = str.indexOf(PREFIX_UNICODE); 28 while (index != -1) { 29 sb.append(str.substring(begin, index)); 30 sb.append(ascii2Char(str.substring(index, index + 6))); 31 begin = index + 6; index = str.indexOf(PREFIX_UNICODE, begin); 32 } 33 sb.append(str.substring(begin)); 34 return sb.toString(); 35 } 36 37 /* 38 * unicode代码 来自 黑暗的笑 的CSDN 博客 ,全文地址请点击:https://blog.csdn.net/xia744510124/article/details/51322107?utm_source=copy 39 */ 40 41 public static void main(String[] args) { 42 Scanner in=new Scanner(System.in); 43 String str,s; 44 int s1,s2,s3; 45 String tag=new String("</script>"); 46 int x,y,sum_pro,i; 47 48 while ((str=in.nextLine())!=null) { 49 if (str.length()>=9 && str.substring(0,9).equals(tag)) { 50 51 s="problem_naming"; 52 x=str.indexOf(s); 53 x+=s.length()+3; 54 y=str.indexOf("]",x); 55 sum_pro=(y-x)/4; 56 57 System.out.print("team\tschool\tcount\ttime\t"); 58 for (i=0;i<sum_pro;i++) 59 System.out.print((char)(65+i)+"\t"); 60 System.out.println(); 61 62 y=str.indexOf("prev_page_url",y); 63 64 while (true) { 65 s="name"; 66 x=str.indexOf(s,y); 67 if (x==-1) 68 break; 69 x+=s.length()+3; 70 y=str.indexOf("\"",x); 71 System.out.print(str.substring(x,y)+"\t"); 72 73 s="school"; 74 x=str.indexOf(s,y); 75 x+=s.length()+3; 76 y=str.indexOf("\"",x); 77 System.out.print(ascii2Native(str.substring(x,y))+"\t"); 78 79 s="score"; 80 x=str.indexOf(s,y); 81 x+=s.length()+2; 82 y=str.indexOf(",",x); 83 System.out.print(str.substring(x,y)+"\t"); 84 85 s="cost"; 86 x=str.indexOf(s,y); 87 x+=s.length()+2; 88 y=str.indexOf(",",x); 89 System.out.print(str.substring(x,y)+"\t"); 90 91 // until not exists or ==cost -1 92 for (i=1;i<=sum_pro;i++) { 93 //cost":120,"exact_cost":7144,"submit_count":4,"problem_score":1,"score":0 94 s="cost\""; 95 x=str.indexOf(s,y); 96 x+=s.length()+1; //2-1 97 y=str.indexOf(",",x); 98 s1=Integer.valueOf(str.substring(x,y)); 99 100 s="exact_cost"; 101 x=str.indexOf(s,y); 102 x+=s.length()+2; 103 y=str.indexOf(",",x); 104 s2=Integer.valueOf(str.substring(x,y)); 105 106 s="submit_count"; 107 x=str.indexOf(s,y); 108 x+=s.length()+2; 109 y=str.indexOf(",",x); 110 s3=Integer.valueOf(str.substring(x,y)); 111 112 if (s2!=0) 113 System.out.print(s1); 114 else 115 System.out.print("——"); 116 System.out.print("("+s3+")\t"); 117 } 118 System.out.println(); 119 } 120 } 121 } 122 } 123 }
效果:
2. hihocoder(多页成绩整合在一起)
1 /** 2 * get source code: 3 * https://www.cnblogs.com/chaohu13/p/5337498.html 4 */ 5 import java.io.BufferedReader; 6 import java.io.InputStreamReader; 7 import java.net.HttpURLConnection; 8 import java.net.URL; 9 10 public class GetScore_hiho { 11 public static void main(String args[]){ 12 URL url; 13 int responsecode; 14 HttpURLConnection urlConnection; 15 BufferedReader reader; 16 String str,str1; 17 String tag=new String("<tr class=\"std-acm\">"); 18 String website; 19 //修改1 必须是"rank?page="形式 20 website=new String("http://hihocoder.com/contest/acmicpc2018beijingonline/rank?page=1"); 21 int x,y,i; 22 //修改2 23 int page=13; 24 int index=0; 25 Boolean vis; 26 27 vis=false; //首栏只用存在一次 28 for (index=1;index<=page;index++) { 29 try{ 30 //生成一个URL对象,要获取源代码的网页地址为:http://www.sina.com.cn 31 x=website.indexOf("="); 32 website=website.substring(0,x+1)+String.valueOf(index); 33 url=new URL(website); 34 35 //打开URL 36 urlConnection = (HttpURLConnection)url.openConnection(); 37 //获取服务器响应代码 38 responsecode=urlConnection.getResponseCode(); 39 if(responsecode==200){ 40 //得到输入流,即获得了网页的内容 41 reader=new BufferedReader(new InputStreamReader(urlConnection.getInputStream(),"UTF-8"));//GBK 42 while((str=reader.readLine().trim())!=null){ 43 // System.out.println(str); //test 44 if (str.equals(tag)==true) { 45 str=reader.readLine().trim(); 46 x=str.indexOf(">"); 47 y=str.indexOf("<",x); 48 49 if (str.substring(x+1,y).equals("Rank")==true) { 50 if (vis==false) { 51 vis=true; 52 System.out.print(str.substring(x+1,y).trim()+"\t"); 53 while (true) { 54 str=reader.readLine().trim(); 55 if (str.equals("</tr>")==true) 56 break; 57 x=str.indexOf(">"); 58 y=str.indexOf("<",x); 59 System.out.print(str.substring(x+1,y).trim()+"\t"); 60 61 if ((x=str.indexOf(">",y))!=str.length()-1) { 62 y=str.indexOf("<",x); 63 System.out.print(str.substring(x+1,y).trim()+"\t"); 64 } 65 } 66 System.out.println(); 67 } 68 // System.exit(0); //test 69 } 70 else { 71 /* 72 * <td>1</td> 73 * <td>清华大学</td> 74 */ 75 System.out.print(str.substring(x+1,y).trim()+"\t"); 76 for (i=2;i<=2;i++) { //1+1 77 str=reader.readLine().trim(); 78 x=str.indexOf(">"); 79 y=str.indexOf("<",x); 80 System.out.print(str.substring(x+1,y).trim()+"\t"); 81 } 82 83 //<td><a class="fn-ell" style="display: block;" href="/user/109506">team181814</a></td> 84 str=reader.readLine().trim(); 85 x=str.indexOf(">",5); 86 y=str.indexOf("<",x); 87 System.out.print(str.substring(x+1,y).trim()+"\t"); 88 89 /* 90 * <td class="solved">8</td> 91 * <td>15:20:09</td> 92 */ 93 for (i=1;i<=2;i++) { 94 str=reader.readLine().trim(); 95 x=str.indexOf(">"); 96 y=str.indexOf("<",x); 97 System.out.print(str.substring(x+1,y).trim()+"\t"); 98 } 99 100 while (true) { 101 str=reader.readLine().trim(); 102 if (str.equals("</tr>")==true) 103 break; 104 str=reader.readLine().trim(); 105 str=reader.readLine().trim(); 106 if (str.equals("</td>")==true) 107 str=""; 108 else if (str.charAt(0)>='0' && str.charAt(0)<='9') { 109 x=str.indexOf("<br>"); 110 if (x!=-1) { 111 y=str.indexOf(")",x+4); 112 str=str.substring(0,7)+" "+str.substring(x+4,y+1); 113 str1=reader.readLine(); //读多一行 114 } 115 else 116 str=str.substring(0,7); 117 } 118 else { 119 x=str.indexOf(")"); 120 str=str.substring(0,x+1); 121 str1=reader.readLine(); //读多一行 122 } 123 System.out.print(str+"\t"); 124 } 125 System.out.println(); 126 } 127 // System.exit(0); //test 128 } 129 } 130 } 131 else{ 132 System.out.println("获取不到网页的源码,服务器响应代码为:"+responsecode); 133 } 134 } 135 catch(Exception e){ 136 //End Of Input 137 // System.out.println("获取不到网页的源码,出现异常:"+e); 138 } 139 } 140 141 142 } 143 } 144 /* 145 p=Pattern.compile("<td>|</td>"); 146 m=p.matcher(str); 147 str=m.replaceAll(""); 148 System.out.print(str+"\t"); 149 */
效果: