0803/Code
Data processing:
import csv
from csv import writer

import pandas as pd

# Split the raw July log into one CSV per hour: field index 3 of each
# comma-separated row holds the hour, which names the output file.
with open('july.txt', 'r', encoding='utf-8') as file:
    for line in file:
        fields = line.strip('\n').split(',')
        id_time = fields[3]
        with open("{}.csv".format(id_time), "a", newline="") as f_object:
            writer_object = writer(f_object)
            writer_object.writerow(fields)

# Aggregate each hourly file (hours 6-18) by origin (OUT) and destination (IN).
for i in range(6, 19):
    csv_file = pd.read_csv('{}.csv'.format(i), usecols=['OUT', 'SUM1', 'IN', 'SUM2'])
    grouped_OUT = csv_file.groupby('OUT')['SUM1'].sum().reset_index()
    grouped_OUT.to_csv('{}_OUT.csv'.format(i), index=False)
    grouped_IN = csv_file.groupby('IN')['SUM2'].sum().reset_index()
    grouped_IN.to_csv('{}_IN.csv'.format(i), index=False)
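For reference, a minimal sketch of how one raw row is routed; the field values here are hypothetical, and only the hour's position (index 3) is taken from the code above.

sample = "NODE_A,NODE_B,120,7\n"          # hypothetical july.txt row
fields = sample.strip('\n').split(',')
print(fields[3])                          # '7' -> this row is appended to 7.csv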
Task 1 - AUTO-ARIMA:
import csv
from csv import writer

import numpy as np
from pmdarima import auto_arima

# Fit an ARIMA per node (one CSV row per node: ID, then 13 hourly values)
# and write a one-step-ahead forecast with its confidence interval.
with open('节点IN.csv') as csv_file:
    for line in csv.reader(csv_file):
        data = np.asarray(line[1:14], dtype=float)
        try:
            model1 = auto_arima(data, start_p=1, start_q=1, max_p=3, max_q=3,
                                start_P=0, seasonal=False, trace=True,
                                error_action='ignore', suppress_warnings=True,
                                stepwise=True)
            forecast, conf_int = model1.predict(n_periods=1, return_conf_int=True)
            forecast = forecast / 0.1   # undo the 0.1 scaling (multiply back by 10)
            list_data = [line[0], forecast.tolist()[0],
                         conf_int[0][0] / 0.1, conf_int[0][1] / 0.1]
            print(list_data)
            with open("节点IN_PRED.csv", "a", newline="") as f_object:
                writer(f_object).writerow(list_data)
        except Exception:
            # Log rows that fail to fit instead of aborting the whole run.
            with open("LOG.csv", "a", newline="") as f2_object:
                writer(f2_object).writerow(line)
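The same call can be smoke-tested in isolation; a minimal sketch on synthetic data (the series below is made up, standing in for one node's 13 hourly values):

import numpy as np
from pmdarima import auto_arima

rng = np.random.default_rng(0)
series = 50 + np.cumsum(rng.normal(size=13))   # synthetic stand-in series
model = auto_arima(series, seasonal=False, error_action='ignore',
                   suppress_warnings=True)
forecast, conf_int = model.predict(n_periods=1, return_conf_int=True)
print(forecast.tolist()[0], conf_int[0])       # point forecast, (lower, upper)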
Task 2: SVM/GBDT/RF/KNN
import csv
from csv import writer

import numpy as np
from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# For each node, regress the 13 hourly values (hours 6-18) on the hour index
# with four models and record in-sample R2, MAE, and the fitted values.
with open('节点IN.csv') as csv_file:
    for line in csv.reader(csv_file):
        line_data = [float(x) for x in line[1:14]]
        X = np.arange(6, 19).reshape(-1, 1)   # hours 6-18 as the single feature
        y = np.asarray(line_data)
        # Train and evaluate on the same 13 points (in-sample comparison).
        X_train, X_test, y_train, y_test = X, X, y, y

        # SVM
        model = SVR(kernel='rbf', gamma='scale', C=10, epsilon=0.05)
        model.fit(X_train, y_train)
        # GBDT
        model2 = GradientBoostingRegressor()
        model2.fit(X_train, y_train)
        # RF
        model3 = RandomForestRegressor()
        model3.fit(X_train, y_train)
        # KNN
        model4 = KNeighborsRegressor()
        model4.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        y_pred2 = model2.predict(X_test)
        y_pred3 = model3.predict(X_test)
        y_pred4 = model4.predict(X_test)

        R2 = metrics.r2_score(y_test, y_pred)
        R2_2 = metrics.r2_score(y_test, y_pred2)
        R2_3 = metrics.r2_score(y_test, y_pred3)
        R2_4 = metrics.r2_score(y_test, y_pred4)
        MAE = metrics.mean_absolute_error(y_test, y_pred)
        MAE_2 = metrics.mean_absolute_error(y_test, y_pred2)
        MAE_3 = metrics.mean_absolute_error(y_test, y_pred3)
        MAE_4 = metrics.mean_absolute_error(y_test, y_pred4)
        print(R2, R2_2, R2_3, R2_4)
        print(MAE, MAE_2, MAE_3, MAE_4)

        list_data = [R2, R2_2, R2_3, R2_4, MAE, MAE_2, MAE_3, MAE_4,
                     y_pred, y_pred2, y_pred3, y_pred4]
        with open("节点_NH.csv", "a", newline="") as f_object:
            writer(f_object).writerow(list_data)
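Since the metrics above are in-sample, an out-of-sample estimate would need held-out points; a minimal leave-one-out sketch for the SVR, with synthetic y standing in for one node's flows:

import numpy as np
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.svm import SVR

X = np.arange(6, 19).reshape(-1, 1)   # hours 6-18, as above
y = 100 + 10 * np.sin(X.ravel())      # synthetic stand-in values

scores = cross_val_score(SVR(kernel='rbf', C=10, epsilon=0.05), X, y,
                         cv=LeaveOneOut(), scoring='neg_mean_absolute_error')
print(-scores.mean())                 # out-of-sample MAE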
Task 3: sklearn RF
# coding:utf-8
import time

import matplotlib
matplotlib.use('Agg')
import numpy as np
import pandas as pd
from csv import writer
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import plot_partial_dependence  # removed in sklearn >= 1.2
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.utils import shuffle

# Shared predictor columns; the two OUT/IN flow columns are added per hour.
BASE_FEATURES = ['STATION', 'DIS_CEN', 'ROAD', 'CANYIN', 'FENGJING', 'GONGSI',
                 'GOUWU', 'KEJIAO', 'ZHUZHAI', 'SHENGHUO', 'YILIAO', 'BTSM',
                 'WATER', 'POI', 'RENKOU', 'JULI']

# Fit a random forest for hour i, report cross-validated scores, and plot.
def main_process(NAME, i, nestimator, maxdepth):
    pd_data = pd.read_csv(NAME, low_memory=False)
    features = BASE_FEATURES + ['OUT_HOUR{}'.format(i), 'IN_HOUR{}'.format(i)]
    X_origin = pd_data.loc[:, features]
    y_origin = pd_data.loc[:, 'HOUR{}'.format(i)]
    X_test, y_test = shuffle(X_origin, y_origin)
    X, y = X_origin, y_origin

    # Define and train the model.
    rfreg = RandomForestRegressor(n_estimators=nestimator, max_depth=maxdepth,
                                  random_state=90)
    rfreg.fit(X, y)
    print('Feature importances:')
    print(rfreg.feature_importances_)
    y_pred = rfreg.predict(X)
    np.savetxt('{}.txt'.format(i), y_pred)

    # 5-fold cross-validation; sklearn negates the MAE, so flip the sign back.
    cross_score = cross_val_score(rfreg, X_test, y_test, cv=5, scoring='r2').mean()
    print('R2:', cross_score)
    cross_score2 = cross_val_score(rfreg, X_test, y_test, cv=5,
                                   scoring='neg_mean_absolute_error').mean()
    print('MAE:', -cross_score2)
    cross_score3 = cross_val_score(rfreg, X_test, y_test, cv=5,
                                   scoring='explained_variance').mean()
    print('EV:', cross_score3)
    list_data_1 = [cross_score, -cross_score2, cross_score3]
    with open("节点_参数.csv", "a", newline="") as f_object:
        writer(f_object).writerow(list_data_1)

    # Plot predicted vs. observed flow.
    plt.ion()
    plt.figure(figsize=(24, 8))
    plt.plot(range(len(y_pred)), y_pred, 'r', label="predict")
    plt.plot(range(len(y_pred)), y, 'b', label="test", alpha=0.5)
    plt.legend(loc="upper center")
    plt.xlabel("ID")
    plt.ylabel('流量')
    plt.savefig("{}.png".format(i))
    plt.show()
    plt.close()
    time.sleep(5)

    # Partial-dependence plots for all input features.
    # line_kw={"color": "black", "lw": 0.8} could be passed through to plot().
    plot_partial_dependence(rfreg, X, features, n_jobs=3, grid_resolution=20,
                            method='brute')
    plt.ion()   # interactive mode so the figure closes automatically
    plt.gcf().set_size_inches(12, 24)
    plt.savefig("{}_1.png".format(i))
    plt.show()
    time.sleep(5)
    plt.close()

# Automatic hyper-parameter search for the forest.
def grid_search(NAME, i_1):
    pd_data = pd.read_csv(NAME, low_memory=False)
    features = BASE_FEATURES + ['OUT_HOUR{}'.format(i_1), 'IN_HOUR{}'.format(i_1)]
    X_origin = pd_data.loc[:, features]
    y_origin = pd_data.loc[:, 'HOUR{}'.format(i_1)]
    X_origin, y_origin = shuffle(X_origin, y_origin)
    X, y = X_origin[0:5000], y_origin[0:5000]   # subsample to keep the search fast
    rfreg_ = RandomForestRegressor(random_state=90)
    # Search ranges for the two tree parameters.
    param_grid = {'n_estimators': range(1, 400, 50),
                  'max_depth': range(1, 20, 3)}
    rf_best = GridSearchCV(rfreg_, param_grid=param_grid, cv=5,
                           scoring='neg_mean_absolute_error')
    rf_best.fit(X, y)
    # Print and return the best parameters.
    print(rf_best.best_params_)
    return rf_best.best_params_['n_estimators'], rf_best.best_params_['max_depth']

path = '线_ALL_origin.csv'
for i_real in range(6, 19):
    # nestimator_, maxdepth_ = grid_search(path, i_real)  # uncomment to search
    nestimator_ = 200
    maxdepth_ = 10
    list_data_2 = [nestimator_, maxdepth_]
    with open("节点_树参数.csv", "a", newline="") as f_object:
        writer(f_object).writerow(list_data_2)
    main_process(path, i_real, nestimator_, maxdepth_)
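On scikit-learn >= 1.2 plot_partial_dependence is gone; a minimal self-contained sketch of the replacement API, on made-up data (the two features and response below are synthetic placeholders):

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay

X = np.random.default_rng(0).normal(size=(200, 2))   # synthetic features
y = X[:, 0] ** 2 + X[:, 1]                           # made-up response
rf = RandomForestRegressor(n_estimators=50, random_state=90).fit(X, y)
PartialDependenceDisplay.from_estimator(rf, X, features=[0, 1], grid_resolution=20)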
Network graph plotting:
import csv
from csv import reader

import matplotlib.pyplot as plt
import networkx as nx
from jenkspy import jenks_breaks   # only used by the commented-out natural-breaks variant

# Build the OD network for hour i and draw its edges layered by weight.
def networkxG(NAME1, NAME2, i):
    # Initialise the graph and the node-position dict.
    G = nx.Graph()
    pos = {}
    # Add nodes; each row of NAME1 is id, lon, lat.
    with open(NAME1, "r") as my_file:
        for line in reader(my_file):
            node_id, lon, lat = line[0], line[1], line[2]
            G.add_node(node_id)
            pos[node_id] = (float(lon) * 10, float(lat) * 10)
    # Add weighted edges; column i-4 of NAME2 holds hour i's weight.
    jen_list = []
    with open(NAME2, "r") as my_file2:
        for line in reader(my_file2):
            OUT, IN, wei = line[0], line[1], line[i - 4]
            G.add_edge(OUT, IN, weight=float(wei))
            jen_list.append(float(wei))
    # Bin the edges by weight so they can be drawn as separate layers.
    G1, G2, G3, G4 = [], [], [], []
    # Natural-breaks alternative to the fixed thresholds:
    # breaks = jenks_breaks(jen_list, n_classes=2)
    # print(breaks)
    for u, v in G.edges():
        b = G[u][v]['weight']
        if b <= 10:
            G1.append((u, v))
        elif b <= 50:
            G2.append((u, v))
        elif b <= 100:
            G3.append((u, v))
        else:
            G4.append((u, v))
    # nx.draw_networkx_nodes(G, pos, node_size=0.1)
    plt.ion()
    plt.figure(dpi=100)
    # Draw low-weight edges first so the heavy ones end up on top.
    nx.draw_networkx_edges(G, pos, edgelist=G1, width=0.1, alpha=0.1, edge_color='peachpuff')
    nx.draw_networkx_edges(G, pos, edgelist=G2, width=0.2, alpha=0.2, edge_color='lightsalmon')
    nx.draw_networkx_edges(G, pos, edgelist=G3, width=0.2, alpha=0.3, edge_color='tomato')
    nx.draw_networkx_edges(G, pos, edgelist=G4, width=0.2, alpha=0.5, edge_color='indianred')
    plt.savefig("{}_1.png".format(i))   # save before show so the file is not blank
    plt.show()
    plt.close()

node_path = '点.csv'
edge_path = 'OD_PRED.csv'
for i in range(6, 19):
    networkxG(node_path, edge_path, i)
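If the natural-breaks route is taken instead of the fixed 10/50/100 thresholds, jenks_breaks returns the class boundaries directly; a minimal sketch with made-up weights (the n_classes argument name assumes jenkspy >= 0.3, older releases call it nb_class):

from jenkspy import jenks_breaks

weights = [2.0, 8.0, 15.0, 40.0, 75.0, 120.0]   # made-up edge weights
breaks = jenks_breaks(weights, n_classes=4)     # returns n_classes + 1 boundaries
print(breaks)   # weight w falls in class k roughly when breaks[k] < w <= breaks[k+1]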
Converting images to a GIF:
import imageio

# Stitch the per-hour frames into an animated GIF at one frame per second.
with imageio.get_writer(uri='节点_IN.gif', mode='I', fps=1) as writer:
    for i in range(15, 19):
        writer.append_data(imageio.imread(f'{i+1}.jpg'))
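Newer imageio releases deprecate the top-level imread; pinning the import to the v2 compatibility namespace keeps the loop above working unchanged — a sketch, assuming an imageio version that ships the v2 namespace:

import imageio.v2 as imageio   # legacy v2 API on newer installs

with imageio.get_writer(uri='节点_IN.gif', mode='I', fps=1) as writer:
    for i in range(15, 19):
        writer.append_data(imageio.imread(f'{i+1}.jpg'))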