爬取ACM比赛成绩(将多页成绩整合在一起、获取无法直接复制的数据)(hihocoder、计蒜客)

 

https://github.com/congmingyige/web-crawler_rank-of-competition-in-JiSuanKe-and-hihocoder

 

1. 计蒜客(获取复制不了的数据)

  1 import java.util.Scanner;
  2 
  3 /**
  4  * 无法从网页上获得源代码
  5  */
  6 
  7 public class GetScore_jisuanke {
  8     
  9     static String PREFIX_UNICODE= "\\u";
 10     static char ascii2Char(String str) { 
 11         if (str.length() != 6) { 
 12             throw new IllegalArgumentException("Ascii string of a native character must be 6 character."); 
 13         } 
 14         if (!PREFIX_UNICODE.equals(str.substring(0, 2))) { 
 15             throw new IllegalArgumentException("Ascii string of a native character must start with \"\\u\"."); 
 16         } 
 17         String tmp = str.substring(2, 4); // 将十六进制转为十进制 
 18         int code = Integer.parseInt(tmp, 16) << 8; // 转为高位,后与地位相加 
 19         tmp = str.substring(4, 6); 
 20         code += Integer.parseInt(tmp, 16); // 与低8为相加 
 21         return (char) code; 
 22     } 
 23     
 24     static String ascii2Native(String str) { 
 25         StringBuilder sb = new StringBuilder(); 
 26         int begin = 0; 
 27         int index = str.indexOf(PREFIX_UNICODE); 
 28         while (index != -1) { 
 29             sb.append(str.substring(begin, index)); 
 30             sb.append(ascii2Char(str.substring(index, index + 6))); 
 31             begin = index + 6; index = str.indexOf(PREFIX_UNICODE, begin); 
 32         } 
 33         sb.append(str.substring(begin)); 
 34         return sb.toString(); 
 35     }
 36 
 37     /*
 38      * unicode代码  来自 黑暗的笑 的CSDN 博客 ,全文地址请点击:https://blog.csdn.net/xia744510124/article/details/51322107?utm_source=copy 
 39      */
 40     
 41     public static void main(String[] args) {
 42         Scanner in=new Scanner(System.in);
 43         String str,s;
 44         int s1,s2,s3;
 45         String tag=new String("</script>");
 46         int x,y,sum_pro,i;
 47         
 48         while ((str=in.nextLine())!=null) {
 49             if (str.length()>=9 && str.substring(0,9).equals(tag)) {
 50 
 51                 s="problem_naming";
 52                 x=str.indexOf(s);
 53                 x+=s.length()+3;
 54                 y=str.indexOf("]",x);
 55                 sum_pro=(y-x)/4;
 56 
 57                 System.out.print("team\tschool\tcount\ttime\t");
 58                 for (i=0;i<sum_pro;i++)
 59                     System.out.print((char)(65+i)+"\t");
 60                 System.out.println();
 61                 
 62                 y=str.indexOf("prev_page_url",y);
 63                 
 64                 while (true) {
 65                     s="name";
 66                     x=str.indexOf(s,y);
 67                     if (x==-1)
 68                         break;
 69                     x+=s.length()+3;
 70                     y=str.indexOf("\"",x);
 71                     System.out.print(str.substring(x,y)+"\t");
 72                     
 73                     s="school";
 74                     x=str.indexOf(s,y);
 75                     x+=s.length()+3;
 76                     y=str.indexOf("\"",x);
 77                     System.out.print(ascii2Native(str.substring(x,y))+"\t");
 78                     
 79                     s="score";
 80                     x=str.indexOf(s,y);
 81                     x+=s.length()+2;
 82                     y=str.indexOf(",",x);
 83                     System.out.print(str.substring(x,y)+"\t");
 84 
 85                     s="cost";
 86                     x=str.indexOf(s,y);
 87                     x+=s.length()+2;
 88                     y=str.indexOf(",",x);
 89                     System.out.print(str.substring(x,y)+"\t");
 90                     
 91                     // until not exists or ==cost  -1
 92                     for (i=1;i<=sum_pro;i++) {
 93                         //cost":120,"exact_cost":7144,"submit_count":4,"problem_score":1,"score":0
 94                         s="cost\"";
 95                         x=str.indexOf(s,y);
 96                         x+=s.length()+1;    //2-1
 97                         y=str.indexOf(",",x);
 98                         s1=Integer.valueOf(str.substring(x,y));
 99 
100                         s="exact_cost";
101                         x=str.indexOf(s,y);
102                         x+=s.length()+2;
103                         y=str.indexOf(",",x);
104                         s2=Integer.valueOf(str.substring(x,y));
105                         
106                         s="submit_count";
107                         x=str.indexOf(s,y);
108                         x+=s.length()+2;
109                         y=str.indexOf(",",x);
110                         s3=Integer.valueOf(str.substring(x,y));
111                         
112                         if (s2!=0)
113                             System.out.print(s1);
114                         else
115                             System.out.print("——");
116                         System.out.print("("+s3+")\t");
117                     }
118                     System.out.println();
119                 }
120             }
121         }
122     }
123 }

效果:

 

2. hihocoder(多页成绩整合在一起)

  1 /**
  2  * get source code:
  3  * https://www.cnblogs.com/chaohu13/p/5337498.html
  4  */
  5 import java.io.BufferedReader;
  6 import java.io.InputStreamReader;
  7 import java.net.HttpURLConnection;
  8 import java.net.URL;
  9 
 10 public class GetScore_hiho {
 11     public static void main(String args[]){    
 12         URL url;
 13         int responsecode;
 14         HttpURLConnection urlConnection;
 15         BufferedReader reader;
 16         String str,str1;
 17         String tag=new String("<tr class=\"std-acm\">");
 18         String website;
 19     //修改1 必须是"rank?page="形式
 20         website=new String("http://hihocoder.com/contest/acmicpc2018beijingonline/rank?page=1");
 21         int x,y,i;
 22     //修改2
 23         int page=13;
 24     int index=0;
 25         Boolean vis;
 26         
 27         vis=false;    //首栏只用存在一次
 28         for (index=1;index<=page;index++) {
 29             try{
 30                 //生成一个URL对象,要获取源代码的网页地址为:http://www.sina.com.cn
 31                 x=website.indexOf("=");
 32                 website=website.substring(0,x+1)+String.valueOf(index);
 33                 url=new URL(website);
 34                 
 35                 //打开URL
 36                 urlConnection = (HttpURLConnection)url.openConnection();
 37                 //获取服务器响应代码
 38                 responsecode=urlConnection.getResponseCode();
 39                 if(responsecode==200){
 40                     //得到输入流,即获得了网页的内容
 41                     reader=new BufferedReader(new InputStreamReader(urlConnection.getInputStream(),"UTF-8"));//GBK
 42                     while((str=reader.readLine().trim())!=null){
 43     //                    System.out.println(str);    //test
 44                         if (str.equals(tag)==true) {
 45                             str=reader.readLine().trim();
 46                             x=str.indexOf(">");
 47                             y=str.indexOf("<",x);
 48 
 49                             if (str.substring(x+1,y).equals("Rank")==true) {
 50                                 if (vis==false) {
 51                                     vis=true;
 52                                     System.out.print(str.substring(x+1,y).trim()+"\t");
 53                                     while (true) {
 54                                         str=reader.readLine().trim();
 55                                         if (str.equals("</tr>")==true)
 56                                             break;
 57                                         x=str.indexOf(">");
 58                                         y=str.indexOf("<",x);
 59                                         System.out.print(str.substring(x+1,y).trim()+"\t");
 60                                         
 61                                         if ((x=str.indexOf(">",y))!=str.length()-1) {
 62                                             y=str.indexOf("<",x);
 63                                             System.out.print(str.substring(x+1,y).trim()+"\t");
 64                                         }
 65                                     }
 66                                     System.out.println();
 67                                 }
 68 //                                System.exit(0);    //test
 69                             }
 70                             else {
 71                                 /*
 72                                  * <td>1</td>
 73                                  * <td>清华大学</td>
 74                                  */
 75                                 System.out.print(str.substring(x+1,y).trim()+"\t");
 76                                 for (i=2;i<=2;i++) {    //1+1
 77                                     str=reader.readLine().trim();
 78                                     x=str.indexOf(">");
 79                                     y=str.indexOf("<",x);
 80                                     System.out.print(str.substring(x+1,y).trim()+"\t");    
 81                                 }
 82         
 83                                 //<td><a class="fn-ell" style="display: block;" href="/user/109506">team181814</a></td>
 84                                 str=reader.readLine().trim();
 85                                 x=str.indexOf(">",5);
 86                                 y=str.indexOf("<",x);
 87                                 System.out.print(str.substring(x+1,y).trim()+"\t");    
 88         
 89                                 /*
 90                                  * <td class="solved">8</td>
 91                                  * <td>15:20:09</td>
 92                                  */
 93                                 for (i=1;i<=2;i++) {
 94                                     str=reader.readLine().trim();
 95                                     x=str.indexOf(">");
 96                                     y=str.indexOf("<",x);
 97                                     System.out.print(str.substring(x+1,y).trim()+"\t");    
 98                                 }
 99                                 
100                                 while (true) {
101                                     str=reader.readLine().trim();
102                                     if (str.equals("</tr>")==true)
103                                         break;
104                                     str=reader.readLine().trim();
105                                     str=reader.readLine().trim();
106                                     if (str.equals("</td>")==true)
107                                         str="";
108                                     else if (str.charAt(0)>='0' && str.charAt(0)<='9') {
109                                         x=str.indexOf("<br>");
110                                         if (x!=-1) {
111                                             y=str.indexOf(")",x+4);
112                                             str=str.substring(0,7)+" "+str.substring(x+4,y+1);
113                                             str1=reader.readLine();    //读多一行
114                                         }
115                                         else
116                                             str=str.substring(0,7);
117                                     }
118                                     else {
119                                         x=str.indexOf(")");
120                                         str=str.substring(0,x+1);
121                                         str1=reader.readLine();    //读多一行
122                                     }
123                                     System.out.print(str+"\t");
124                                 }
125                                 System.out.println();
126                             }
127 //                            System.exit(0);    //test
128                         }
129                     }
130                 }
131                 else{
132                     System.out.println("获取不到网页的源码,服务器响应代码为:"+responsecode);
133                 }
134             }
135             catch(Exception e){
136                 //End Of Input
137     //            System.out.println("获取不到网页的源码,出现异常:"+e);
138             }
139         }
140         
141 
142     }
143 }
144 /*
145                         p=Pattern.compile("<td>|</td>");
146                         m=p.matcher(str);
147                         str=m.replaceAll("");
148                         System.out.print(str+"\t");
149 */

 

效果:

 

posted @ 2018-09-24 15:31  congmingyige  阅读(655)  评论(0编辑  收藏  举报