get_data_use_notbom: write custom external data automatically
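
The script below loads a locally saved HTML table of limit-up stocks, merges each stock's 涨停时间 / 历史涨停原因 / 涨停选原因 fields into one text field, and appends the result to a pipe-delimited extern_user.txt as custom external data, converting between GB18030 and UTF-8 on the way in and out.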

import urllib.request

import pandas as pd
from pandas import DataFrame, Series
from bs4 import BeautifulSoup
import chardet

file_name = "2222-11.txt"
#file_name = "2222.txt"
file_path = 'file:///F:/python/untitled1/core/do_data/save2/'

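# read the saved HTML page through a file:// URL; urlopen returns the raw bytes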
response = urllib.request.urlopen(file_path + file_name)
html = response.read()
#result = chardet.detect(html)  # detect the file's encoding
#print(result)
#print(html)

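# parse the table: the <th> cells of the first <tr> become the DataFrame column names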
soup = BeautifulSoup(html, "html.parser")
trs = soup.find_all('tr')
ths = trs[0].find_all('th')

index_d = []
for th in ths:
    index_d.append(th.getText())
data = DataFrame(columns=index_d)
print(index_d)

# DataFrame.append was removed in pandas 2.0, so collect one Series per <tr>
# and concatenate them in a single step
rows = []
for tr in trs:
    tds = tr.find_all('td')
    td_datas = [td.getText() for td in tds]
    if len(td_datas) != 0:
        rows.append(Series(td_datas, index=index_d))
if rows:
    data = pd.concat([data, DataFrame(rows)], ignore_index=True)

print(len(data))

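# merge the 涨停时间 / 历史涨停原因 / 涨停选原因 columns into one text field per stock,
# then keep only the last row for each 股票代码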
str2s = []

for i in range(len(data)):
    str2 = data["涨停时间"][i] + " " + data["历史涨停原因"][i] + " " + data["涨停选原因"][i]
    str2s.append(str2)

data["new"] = str2s
data = data.drop_duplicates(subset=['股票代码'], keep='last', inplace=False)
print(len(data))


ofile = "extern_user.txt"


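# the source extern_user.txt is assumed to be GB-encoded here, while pandas reads it as UTF-8,
# so convert to UTF-8 before reading and back to GB18030 after writing;
# the helpers simply prefix the output name with "utf", so they expect a bare file name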
def gb_trans_utf8(file_path):
    with open(file_path, 'r', encoding='gb18030') as f:
        content = f.read()
    #print(content)
    with open("utf"+file_path, 'w', encoding='utf-8') as f:
        f.write(content)

def utf8_trans_gb(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    #print(content)
    with open(file_path, 'w', encoding='gb18030') as f:
        f.write(content)

gb_trans_utf8(ofile)


new_data = pd.read_table("utf" + ofile, header=None, sep="|", encoding="utf-8", dtype=str)

new_data = new_data.iloc[:,0:4]
new_data = new_data.dropna()

new_data.columns = ["a", "b", "c", "d"]
data = data.reset_index(drop=True)
#data = data.reindex(range(len(data)))
#print(data.iloc[:])
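# for every scraped stock, build one external-data row
# [market flag, stock code, fixed data id "31", merged text] and append it to new_data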
for i in range(len(data)):
#for i in range(10):
    #print(i)
    #print(data.loc[i,"股票代码"])
    #print("haham")

    d_code = str(data.loc[i,"股票代码"])
    #new_data.loc[((new_data["b"] == d_code) &  (new_data["c"] == "31")),"d"]=data.loc[i,"new"]

    # market flag: "1" when the code starts with "6" (Shanghai), otherwise "0"
    flag = "1" if d_code[0] == "6" else "0"
    row = [flag, d_code, "31", data.loc[i, "new"]]
    #print(i)
    print(row)
    #print(new_data.iloc[:])

    # DataFrame.append is gone in pandas 2.0, so concat a one-row DataFrame instead
    new_data = pd.concat(
        [new_data, DataFrame([row], columns=new_data.columns)],
        ignore_index=True
    )
    #print("haha")


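# keep only the newest row per (stock code, data id), then sort by data id and code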
new_data = new_data.drop_duplicates(subset=["b", "c"], keep='last', inplace=False)

new_data["c"]=new_data["c"].astype(int)
new_data = new_data.sort_values(by=["c","b"] , ascending=(True,True))

print(new_data.columns)
new_data=new_data.reset_index(drop=True)

new_data["e"]="0.00"


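# write the file pipe-separated, without index or header, then convert it back to GB18030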
new_data.to_csv('save/extern_user.txt', sep='|', index=False, header=False)

utf8_trans_gb('save/extern_user.txt')

 
