
最近在看 MongoDB,学会了一些最简单的 MongoDB 操作,于是想结合股票信息的抓取,把抓到的数据存储到 MongoDB 中。MongoDB 和关系型数据库最大的区别是:MongoDB 不需要事先建表,可以直接存储数据。不过在选定数据表(集合)并插入数据时,需要先把 str 格式的字符串转换成 JSON(字典)格式再插入。这一点我在插入数据时调试了十多分钟,一直以为是自己字符串的问题,后来对照了插入数据的格式并百度了一下,才发现这一点。数据是插入在本机的 test.Share 表中的,其他就没有什么需要特别注意的了~代码写得很丑,冗余也很大,之后还会继续更新~另外程序是单进程进行数据抓取的~嗯~ 很蠢~

# -*- coding: utf-8 -*-
from __future__ import print_function

import json
import re
import time
import urllib
import urllib2
from Queue import Queue

import matplotlib.pyplot as plt
import requests
from lxml import etree
from pymongo import MongoClient
 12 URL = ''
 13 nation_que = Queue()
 14 client = MongoClient('localhost',27017)
 15 db = client.test
 16 Share = db.Share
 18 def sub_sort(array,array1,low,high):
 19     key = array[low]
 20     key1 = array1[low]
 21     while low < high:
 22         while low < high and array[high] >= key:
 23             high -= 1
 24         while low < high and array[high] < key:
 25             array[low] = array[high]
 26             array1[low] = array1[high]
 27             low += 1
 28             array[high] = array[low]
 29             array1[high] = array1[low]
 30     array[low] = key
 31     array1[low] = key1
 32     return low
 35 def quick_sort(array,array1,low,high):
 36      if low < high:
 37         key_index = sub_sort(array,array1,low,high)
 38         quick_sort(array,array1,low,key_index)
 39         quick_sort(array,array1,key_index+1,high)
 41 def download(url, headers, num_try=2):
 42     while num_try >0:
 43         num_try -= 1
 44         try:
 45             content = requests.get(url, headers=headers)
 46             return content.text
 48         except urllib2.URLError as e:
 49             print 'Download error', e.reason
 51     return None
 53 current_quto = Queue()
 54 open_quto = Queue()
 55 high_quto = Queue()
 56 low_quto = Queue()
 57 close_quto = Queue()
 58 update_time = Queue()
 59 def get_type_url():
 60     headers = {
 61         'User_agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
 62         'Referer': '',
 63         'Cookie': 'io=-voMclEjiizK9nWKALqB; UM_distinctid=15f5938ddc72db-089cf9ba58d9e5-31657c00-fa000-15f5938ddc8b24; Hm_lvt_d25bd1db5bca2537d34deae7edca67d3=1509030420; Hm_lpvt_d25bd1db5bca2537d34deae7edca67d3=1509031023',
 64         'Accept-Language': 'zh-CN,zh;q=0.8',
 65         'Accept-Encoding': 'gzip, deflate',
 66         'Accept': '*/*'
 67     }
 68     content = download(URL,headers)
 69     html = etree.HTML(content)
 70     result = html.xpath('//a[@class="mar_name"]/@href')
 71     result1 = html.xpath('//td/text()')
 72     num = 0
 73     for each in result1:
 75         if num%6 == 0:
 76             current_quto.put(each)
 77             num += 1
 78         elif num%6 == 1:
 79             open_quto.put(each)
 80             num += 1
 81         elif num%6 == 2:
 82             high_quto.put(each)
 83             num += 1
 84         elif num%6 == 3:
 85             low_quto.put(each)
 86             num += 1
 87         elif num %6 == 4:
 88             close_quto.put(each)
 89             num +=1
 90         elif num %6 == 5:
 91             update_time.put(each)
 92             num +=1
 93     #while not
 94     for each in result:
 95         st = each.split('/')
 96         nation_que.put(st[len(st)-1])
 98     get_precent()
def _take_or_none(que):
    """Remove and return the next item of *que*, or None when it is empty."""
    if que.empty():
        return None
    item = que.get(False)
    que.task_done()
    return item


def get_precent():
    """Fetch the intraday series of every queued identifier and store it.

    For each entry of ``nation_que`` one value is taken from each quote
    queue, the series URL is requested through download(), and when the JSON
    answer carries non-empty ``h`` and ``t`` lists everything is handed to
    draw_pict(). A quote queue that has run dry yields None for its field
    rather than an unbound name or the previous identifier's value. An
    identifier whose request or response is unusable is reported and skipped
    so the remaining ones are still processed.
    """
    # NOTE(review): Host, Origin and Referer are empty strings in this copy
    # of the script -- they presumably need real values; confirm.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': '',
        'Origin': '',
        'Referer': '',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    }

    while not nation_que.empty():
        time_update = _take_or_none(update_time)
        new_rates = _take_or_none(current_quto)
        opening = _take_or_none(open_quto)
        high = _take_or_none(high_quto)
        low = _take_or_none(low_quto)
        closing = _take_or_none(close_quto)

        ss = nation_que.get(False)
        # Progress output, one value per line.
        for shown in (ss, low, high, time_update, new_rates, opening):
            print(shown)

        url = '' + ss + '&limit=288&resolution=5&codeType=8100&st=0.8274405615006541'
        print(url)

        try:
            # download() owns the retry policy; the except below only keeps
            # a transport error that escapes it from ending the whole crawl.
            body = download(url, headers)
        except requests.RequestException as e:
            print('Download error: %s' % e)
            body = None

        if body is None:
            print('Skipped %s: no response' % ss)
        else:
            try:
                payload = json.loads(body)
                st = payload['h']
                T_time = payload['t']
            except (ValueError, KeyError, TypeError) as e:
                print('Skipped %s: unexpected response (%s)' % (ss, e))
            else:
                if st and T_time:
                    draw_pict(ss, T_time, st, time_update, new_rates,
                              opening, high, low, closing)
        nation_que.task_done()
# One list of on-the-hour values per stored identifier; consumed by
# draw_picture().
List = []


def _as_text(value):
    """Format *value* the way the stored documents always have: as text."""
    return '%s' % (value,)


def _build_share_document(name, stamps, values, time_update, new_rate,
                          opening, high, low, closing):
    """Assemble the MongoDB document for one identifier as a plain dict."""
    return {
        'Type': _as_text(name),
        'Current_quto': _as_text(new_rate),
        'Opening_quto': _as_text(opening),
        'High_quto': _as_text(high),
        'low_quto': _as_text(low),
        'Closing_quto': _as_text(closing),
        'Update_Time': _as_text(time_update),
        # A repeated timestamp keeps its last value.
        'Real_TIme_infor': dict(
            (_as_text(stamp), _as_text(value))
            for stamp, value in zip(stamps, values)
        ),
    }


def draw_pict(name, T_time1, high_rate, time_update, new_rate, opening, high, low, closing):
    """Store one identifier's quote and series, and keep its hourly values.

    The document is built as a dict and inserted into ``Share`` directly.
    Formatting a JSON string by hand and parsing it back fails as soon as a
    value contains a quote or a backslash. Samples whose local time falls
    exactly on the hour are collected, ordered by hour of day with
    quick_sort(), and their values appended to ``List`` for draw_picture().

    Args:
        name: identifier stored under "Type".
        T_time1: timestamps of the series, each convertible with ``float``.
        high_rate: values of the series, paired by position with T_time1.
        time_update, new_rate, opening, high, low, closing: summary fields
            copied into the document as text.

    Raises:
        ValueError or TypeError: if a timestamp cannot be converted; nothing
            is inserted in that case.
    """
    hours = []
    hourly_values = []
    # Converting every timestamp before touching the database keeps a bad
    # one from leaving a document behind.
    for stamp, value in zip(T_time1, high_rate):
        moment = time.localtime(float(stamp))
        if moment.tm_min == 0:
            hours.append(moment.tm_hour)
            hourly_values.append(value)

    document = _build_share_document(name, T_time1, high_rate, time_update,
                                     new_rate, opening, high, low, closing)
    print(document)
    # NOTE(review): Collection.insert is deprecated since PyMongo 3.0 in
    # favour of insert_one; it is kept because the installed version is not
    # known from this file.
    Share.insert(document)

    quick_sort(hours, hourly_values, 0, len(hourly_values) - 1)
    List.append(hourly_values)
def draw_picture():
    """Plot every series collected in ``List`` on a two-column subplot grid.

    Each series is drawn against its own 0..n-1 index with star markers and
    the figure is shown once all subplots are drawn. Nothing is drawn when
    ``List`` is empty.
    """
    if not List:
        return

    # Two plots per row, rounded up; floor division keeps the row count an
    # integer on Python 3 as well.
    rows = (len(List) + 1) // 2
    for position, series in enumerate(List, 1):
        plt.subplot(rows, 2, position)
        x_axis = list(range(len(series)))
        print(len(series))
        plt.plot(x_axis, series, marker='*')

    # plt.title applies to the current axes, i.e. the last subplot drawn.
    plt.title('Share Message')
    # Without this the figure is built and then discarded when the script
    # exits.
    plt.show()
if __name__ == '__main__':
    # Scrape and store first: draw_picture() plots what that pass put into
    # ``List``.
    get_type_url()
    draw_picture()


