抓取网页信息并截取部分内容生成xml

一、获取网页信息

使用httpclient抓取

/**
 * Fetches {@code url} with an HTTP GET and returns the response body as a single string.
 *
 * <p>Lines are concatenated without their line terminators. The body is decoded with the
 * charset declared in the response's Content-Type header, falling back to UTF-8 when none
 * is declared, instead of the platform default charset.
 *
 * @param url the address of the page to download
 * @return the page content, or an empty string if the response has no entity or the
 *         request fails (the failure is printed to standard error)
 */
@SuppressWarnings("deprecation")
public static String clientTest(String url){
    HttpClient hc=new DefaultHttpClient();
    HttpGet get=new HttpGet(url);
    String backContent="";
    BufferedReader in = null;
    try {
        HttpResponse response=hc.execute(get);
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            // Honour the charset the server declares; UTF-8 is only the fallback.
            String charset = org.apache.http.util.EntityUtils.getContentCharSet(entity);
            if (charset == null) {
                charset = "UTF-8";
            }
            InputStream is = entity.getContent();
            in = new BufferedReader(new InputStreamReader(is, charset));
            StringBuilder buffer = new StringBuilder();
            String line;
            // Read the whole page; readLine() strips the line terminators.
            while ((line = in.readLine()) != null) {
                buffer.append(line);
            }
            backContent = buffer.toString();
        }
    } catch (ClientProtocolException e) {
        e.printStackTrace();
    } catch (IOException e) {
        // Also covers an unsupported charset name reported by the server.
        e.printStackTrace();
    } finally {
        if (in != null) {
            try {
                // Closing the content stream releases the underlying connection.
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        // The client is created per call, so its connection manager must be shut down here.
        hc.getConnectionManager().shutdown();
    }
    return backContent;
}

二、截取所需部分字符串

/**
 * Extracts the part of {@code str} that starts at the first occurrence of
 * {@code beginstr} and ends with the first occurrence of {@code endstr} that follows it.
 * Both markers are included in the result.
 *
 * @param str      the text to search
 * @param beginstr the marker that opens the wanted section
 * @param endstr   the marker that closes the wanted section
 * @return the section from {@code beginstr} through {@code endstr}, inclusive
 * @throws StringIndexOutOfBoundsException if {@code beginstr} is not found, or if
 *         {@code endstr} does not occur after it
 */
public static String sub(String str,String beginstr,String endstr){
    int b=str.indexOf(beginstr);
    if (b < 0) {
        throw new StringIndexOutOfBoundsException("begin marker not found: " + beginstr);
    }
    // Search for the end marker only after the begin marker, so an earlier
    // occurrence of endstr in the text cannot produce an inverted range.
    int e=str.indexOf(endstr, b + beginstr.length());
    if (e < 0) {
        throw new StringIndexOutOfBoundsException("end marker not found after begin marker: " + endstr);
    }
    return str.substring(b, e + endstr.length());
}

三、过滤掉不符合xml规则的字符串

// Blank out the unquoted attribute; the text has no regex metacharacters, so a literal replace is equivalent.
str = str.replace("data-foldGroup=1", " ");

四、保存为xml

/**
 * Writes {@code str} to the file at {@code path}, replacing any existing content.
 * The text is encoded as UTF-8.
 *
 * <p>Failures are reported on standard error and do not propagate to the caller.
 *
 * @param str  the text to write
 * @param path the location of the file to create or overwrite
 */
public static void saveFile(String str,String path){
    File file=new File(path);
    PrintWriter pfp=null;
    try {
        pfp= new PrintWriter(file, "UTF-8");
        pfp.print(str);
        // PrintWriter swallows IOExceptions; checkError() flushes and reveals them.
        if (pfp.checkError()) {
            System.err.println("Failed to write " + path);
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (java.io.UnsupportedEncodingException e) {
        e.printStackTrace();
    } finally {
        // pfp is still null when the constructor itself failed.
        if (pfp != null) {
            pfp.close();
        }
    }
}

最后的调用

/**
 * Downloads the trend page, cuts out the {@code <tbody id="cpdata">} section, blanks the
 * unquoted attributes that are not valid XML, then prints the document and saves it to disk.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String url="http://trend.caipiao.163.com/ssq/?beginPeriod=2015001&endPeriod=2015118";
    String str=clientTest(url);
    if (str.isEmpty()) {
        // clientTest returns "" when the download fails; there is nothing to extract.
        System.err.println("No content received from " + url);
        return;
    }
    str=sub(str,"<tbody id=\"cpdata\">","</tbody>");
    // The targets are literal text, so plain replace() is used instead of regex replaceAll().
    str=str.replace("data-foldGroup=1", " ");
    str=str.replace("data-foldColor=ball_red", " ");
    str=str.replace("data-award=1", " ");
    str=str.replace("data-foldColor=ball_blue", " ");
    // Save exactly what is printed, XML declaration included.
    String xml="<?xml version=\"1.0\" encoding=\"utf-8\"?>"+str;
    System.out.println(xml);
    String path="D:/java/hello.xml";
    saveFile(xml,path);
}

posted on 2015-10-10 14:00  hellovx  阅读(874)  评论(0)  编辑  收藏  举报

导航