城市小区信息

需要统计各城市小区中淘宝用户的覆盖率。数据部门似乎没有统计过这项数据(也可能是我没找到),所以先用 jsoup 把小区名单抓取下来。

感谢 http://hangzhou.haozu.com/ 无偿提供数据。下面是一次性的简易代码。自己去云梯上取数据实在麻烦,剩下的工作就交给数据分析的同事帮忙了。

 1 import org.jsoup.Jsoup;
 2 import org.jsoup.nodes.Document;
 3 import org.jsoup.nodes.Element;
 4 import org.jsoup.select.Elements;
 5 
 6 import java.io.FileOutputStream;
 7 import java.io.IOException;
 8 import java.io.PrintWriter;
 9 import java.util.HashMap;
10 import java.util.HashSet;
11 import java.util.Map;
12 import java.util.Set;
13 
14 /**
15  * Created with IntelliJ IDEA.
16  * User: zhangbin
17  * Date: 13-6-20
18  * Time: 下午10:11
19  * To change this template use File | Settings | File Templates.
20  */
21 public class Main {
22     public static void main(String[] args){
23         String[] cities = new String[]{"beijing","tianjin","dalian","sjz","heb","sy","ty","cc","shanghai","hangzhou","nanjing","jinan","qd"
24         ,"xz","shenzhen","guangzhou","cs","haikou","xm","chengdu","chongqing","wuhan","zhengzhou","xa","lz","ly","gy"
25         };
26         String[] citynames = new String[]{"北京","天津","大连","石家庄","哈尔滨","沈阳","太原","长春","上海","杭州","南京","济南","青岛","徐州","深圳","广州","长沙","海口","厦门"
27                 ,"成都","重庆","武汉","郑州","西安","兰州","洛阳","贵阳"
28         };
29         String middle = "haozu.com/community/";
30         String head = "http://";
31         String url = "";
32         Map<String,Set<String>> blocksMap = new HashMap<String,Set<String>>();
33         for(int i =0 ;i<cities.length;i++){
34            String city = cities[i];
35            String cityName = citynames[i];
36            url = head+city+"."+middle;
37            Set<String> blocks = new HashSet<String>();
38            for(int j=1;j<=10;j++){
39                try {
40                    Document doc = Jsoup.connect(url+"p"+j).get();
41                    Elements eles = doc.getElementsByClass("clist_name");
42                    for(Element ele : eles){
43                       Element tmp = ele.getElementsByTag("a").get(0);
44                       String block = tmp.text();
45                       int index = block.indexOf('(');
46                        if(index != -1){
47                            block = block.substring(0,index);
48                        }
49                        index = block.indexOf('(');
50                        if(index != -1){
51                            block = block.substring(0,index);
52                        }
53                        blocks.add(block);
54                    }
55                } catch (IOException e) {
56                    System.out.println("error");
57                }
58            }
59             blocksMap.put(cityName,blocks);
60         }
61         String lineSep = System.getProperty("line.separator");
62         try {
63             FileOutputStream fos = new FileOutputStream("/home/zhangbin/CityBlocks.data");
64             PrintWriter pw  = new PrintWriter(fos);
65             for(Map.Entry<String,Set<String>> entry : blocksMap.entrySet()){
66                 Set<String> set = entry.getValue();
67                     pw.write(lineSep+lineSep+entry.getKey()+lineSep+lineSep);
68                     for(String tmp : set){
69                         pw.write(tmp+lineSep);
70                     }
71                     pw.flush();
72 
73             }
74             pw.close();
75             fos.close();
76         } catch (Exception e) {
77             //ignore
78         }
79 
80     }
81 }

 

posted @ 2013-06-20 23:40  touchhy  阅读(343)  评论(0编辑  收藏  举报