11.24每日总结
今天完成了大数据的测试。这一次没有用 Hadoop,而是直接用 Python 对数据进行导入、清洗、分析和可视化展示,下面是所有的代码。
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset

# Default locations used by the original script.
DEFAULT_CSV_PATH = 'C:\\Users\\admin\\Desktop\\大数据竞赛练习题\\MathorCup大数据竞赛练习题1\\data\\data\\tmdb_1000_predict.csv'
DEFAULT_MODEL_PATH = "save/test.pth"

FEATURE_COLUMNS = ['budget', 'popularity', 'revenue', 'runtime']
TARGET_COLUMN = 'vote_count'


class LinearRegressionModel(nn.Module):
    """Linear regression expressed as a single ``nn.Linear`` layer.

    Args:
        input_size: Number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        self.linear = nn.Linear(input_size, 1)

    def forward(self, x):
        """Return the linear layer's output for the batch ``x``."""
        return self.linear(x)


def dump_load_demo(csv_path=DEFAULT_CSV_PATH, model_path=DEFAULT_MODEL_PATH):
    """Train a linear regression, save it, reload it and evaluate it.

    Reads ``FEATURE_COLUMNS`` and ``TARGET_COLUMN`` from the CSV, trains a
    ``LinearRegressionModel`` with mini-batch SGD, writes the state dict to
    ``model_path``, loads it back into a fresh model and prints the
    predictions and mean squared error on the held-out split.

    Args:
        csv_path: CSV file to read. Defaults to the path used originally.
        model_path: Where the trained state dict is written and read back.

    Returns:
        The mean squared error on the test split.
    """
    # 1. Load the data
    df = pd.read_csv(csv_path, sep=',', engine='python', header=[0])
    # Drop incomplete rows up front: a single NaN in the features would
    # propagate through the gradients and turn every weight into NaN.
    data = df[[*FEATURE_COLUMNS, TARGET_COLUMN]].dropna()
    feature_arr = data[FEATURE_COLUMNS].values
    tag_arr = data[TARGET_COLUMN].values
    print(len(tag_arr))

    # 2. Basic data processing
    # 2.1 Train/test split
    x_train, x_test, y_train, y_test = train_test_split(
        feature_arr,
        tag_arr[:, np.newaxis],
        random_state=350,
        test_size=0.2,
    )

    # 3. Feature engineering -- standardization
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    # Reuse the statistics learned on the training split; re-fitting on the
    # test split would scale the two sets inconsistently.
    x_test = transfer.transform(x_test)

    # 4. Machine learning (linear regression)
    # 4.1 Model training
    input_size = x_train.shape[1]
    model = LinearRegressionModel(input_size)
    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    # Explicit float32 conversion: the target column may be integer-typed.
    x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
    x_test_tensor = torch.tensor(x_test, dtype=torch.float32)

    # Wrap the training tensors in a PyTorch DataLoader
    train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    num_epochs = 100
    for _ in range(num_epochs):
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    print("这个模型的偏置是:\n", model.linear.bias.item())

    # 4.2 Save the model
    save_dir = os.path.dirname(model_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    torch.save(model.state_dict(), model_path)

    # 4.3 Load the model
    loaded_model = LinearRegressionModel(input_size)
    loaded_model.load_state_dict(torch.load(model_path))
    loaded_model.eval()

    # 5. Model evaluation
    # 5.1 Predictions and error
    with torch.no_grad():
        y_pre = loaded_model(x_test_tensor).numpy()
    print("预测值是:\n", y_pre)

    # y_test and y_pre are NumPy arrays and NaN rows were already removed
    # before the split, so they can be compared directly.
    ret = mean_squared_error(y_test, y_pre)
    print("均方误差是:\n", ret)
    return ret


if __name__ == '__main__':
    dump_load_demo()