本周是开学的第四周,主要学习了数据的地域处理、分类处理以及关键词提取等内容。
学会了百度地图 API、jieba 分词以及 Python 的相关知识。
package com.diyu;

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.xml.sax.SAXException;

/**
 * Static helpers that call the Baidu Maps geocoding and reverse-geocoding
 * web services and pull single values out of their XML responses.
 *
 * <p>Every public method returns an empty string instead of throwing when the
 * request fails or the expected element is missing from the response.
 */
public class getregion {

    private static final Logger LOG = Logger.getLogger(getregion.class.getName());

    /** Access key sent as the {@code ak} parameter; replace the placeholder with a real key. */
    private static final String AK = "您的ak";

    private static final String REVERSE_GEOCODING_URL = "http://api.map.baidu.com/reverse_geocoding/v3/";
    private static final String GEOCODING_URL = "http://api.map.baidu.com/geocoding/v3/";

    /** Connect and read timeout, so that a stalled server cannot block the caller forever. */
    private static final int TIMEOUT_MILLIS = 30000;

    /**
     * Reverse-geocodes a coordinate (sent with {@code coordtype=wgs84ll}) and
     * returns the text of {@code result/formatted_address}.
     *
     * @param lat latitude, inserted into the request URL as given
     * @param lng longitude, inserted into the request URL as given
     * @return the formatted address, or {@code ""} if the lookup failed
     */
    public static String getLocation(String lat, String lng) {
        String url = REVERSE_GEOCODING_URL + "?ak=" + AK
                + "&output=xml&coordtype=wgs84ll&location=" + lat + "," + lng;
        System.out.println(url);
        try {
            String address = childText(fetchRoot(url), "result", "formatted_address");
            System.out.println(address);
            return address;
        } catch (Exception e) {
            // Boundary catch: callers rely on "" rather than an exception on any failure.
            LOG.log(Level.WARNING, "Reverse geocoding failed for " + lat + "," + lng, e);
            return "";
        }
    }

    /**
     * Reverse-geocodes a coordinate (no {@code coordtype} parameter is sent)
     * and returns the text of {@code result/addressComponent/adcode}.
     *
     * @param lat latitude, inserted into the request URL as given
     * @param lng longitude, inserted into the request URL as given
     * @return the adcode text, or {@code ""} if the lookup failed
     */
    public static String getLocation2(String lat, String lng) {
        // output=xml is requested explicitly because the response is parsed as XML.
        String url = REVERSE_GEOCODING_URL + "?ak=" + AK
                + "&output=xml&location=" + lat + "," + lng;
        System.out.println(url);
        try {
            String adcode = childText(fetchRoot(url), "result", "addressComponent", "adcode");
            System.out.println(adcode);
            return adcode;
        } catch (Exception e) {
            LOG.log(Level.WARNING, "Adcode lookup failed for " + lat + "," + lng, e);
            return "";
        }
    }

    /**
     * Geocodes an address to {@code result/location/lat} and
     * {@code result/location/lng}, then passes those coordinates to
     * {@link #getLocation2(String, String)} and returns its result.
     *
     * @param loc the raw (not yet URL-encoded) address; it is UTF-8
     *            percent-encoded here before being placed in the query string
     * @return the value returned by {@code getLocation2}, or {@code ""} if
     *         {@code loc} is null/empty or the geocoding step failed
     */
    public static String getlocation1(String loc) {
        if (loc == null || loc.length() == 0) {
            return "";
        }
        try {
            // Without encoding, non-ASCII text and characters such as '&', '#'
            // or ' ' would corrupt the query string.
            String url = GEOCODING_URL + "?address=" + URLEncoder.encode(loc, "UTF-8")
                    + "&output=xml&ak=" + AK + "&callback=showLocation";
            System.out.println(url);

            Element root = fetchRoot(url);
            String lng = childText(root, "result", "location", "lng");
            String lat = childText(root, "result", "location", "lat");
            System.out.println(lng);
            System.out.println(lat);
            if (lat.length() == 0 || lng.length() == 0) {
                return "";
            }
            return getLocation2(lat, lng);
        } catch (Exception e) {
            LOG.log(Level.WARNING, "Geocoding failed for address: " + loc, e);
            return "";
        }
    }

    /**
     * Delegates to {@link #getlocation1(String)}.
     *
     * @param name the address to look up
     * @return the result of {@code getlocation1(name)}
     */
    public static String SELECT(String name) {
        return getlocation1(name);
    }

    /**
     * Delegates to {@link #getlocation1(String)}; behaves the same as {@link #SELECT(String)}.
     *
     * @param name the address to look up
     * @return the result of {@code getlocation1(name)}
     */
    public static String SELECT1(String name) {
        return getlocation1(name);
    }

    /** Performs an HTTP GET on {@code url} and returns the root element of the XML response. */
    private static Element fetchRoot(String url)
            throws IOException, DocumentException, SAXException {
        HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
        InputStream ins = null;
        try {
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            conn.setReadTimeout(TIMEOUT_MILLIS);
            conn.setUseCaches(false);
            ins = conn.getInputStream();

            SAXReader reader = new SAXReader();
            // The response arrives over plain HTTP; refuse DOCTYPEs so that a
            // tampered response cannot trigger external-entity expansion.
            reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
            Document doc = reader.read(ins);
            return doc.getRootElement();
        } finally {
            if (ins != null) {
                try {
                    ins.close();
                } catch (IOException e) {
                    LOG.log(Level.FINE, "Failed to close response stream", e);
                }
            }
            conn.disconnect();
        }
    }

    /** Walks {@code path} down from {@code start}; returns the final element's text, or "" if any step is missing. */
    private static String childText(Element start, String... path) {
        Element current = start;
        for (String name : path) {
            if (current == null) {
                return "";
            }
            current = current.element(name);
        }
        if (current == null) {
            return "";
        }
        return current.getText();
    }
}
# coding: utf-8
import jieba
import jieba.analyse


def select(text):
    """Extract up to six keywords from ``text`` and return them space-separated.

    The keywords come from ``jieba.analyse.extract_tags`` applied to the
    original ``text``. The joined string is also printed.

    Requires a UTF-8 file named ``stopwords.txt`` (one stop word per line) in
    the current working directory; ``open`` raises if it is missing.

    :param text: the text to analyse
    :return: the keywords joined by single spaces ("" when there are none)
    """
    # Step 1: segmentation. jieba.cut is called without cut_all, so it uses the
    # library's default mode (accurate mode per the jieba docs), not full mode.
    segmented = jieba.cut(text)

    # Step 2: stop-word removal. Tokens listed in stopwords.txt, and the
    # punctuation marks "。" and ",", are dropped; the rest are joined with a
    # leading space before each token.
    with open('stopwords.txt', encoding='utf-8') as stopword_file:
        stopwords = {line.rstrip() for line in stopword_file}
    filtered_text = "".join(
        " " + word
        for word in segmented
        if word not in stopwords and word not in ("。", ",")
    )
    # NOTE(review): filtered_text is not used below; keyword extraction runs on
    # the raw text. Confirm whether step 3 was meant to consume filtered_text.

    # Step 3: keyword extraction.
    #   text      - the text to extract keywords from
    #   topK      - how many of the highest TF/IDF-weighted keywords to return
    #               (library default: 20)
    #   allowPOS  - restrict to the given parts of speech; empty means no filter
    # (extract_tags also accepts withWeight to return weights; not needed here.)
    keywords = jieba.analyse.extract_tags(text, topK=6, allowPOS=())

    result = " ".join(keywords)
    print(result)
    return result


if __name__ == '__main__':
    select('''要分析的字段''')