数据获取+数据分析

爬虫+数据分析:参见MOOC嵩天老师课程+构建User-Agnet代理池 (biancheng.net)

pip安装失败怎么办?直接下载tar.gz格式安装包,然后放到当前文件夹下,再

pip install ./xxxxx/tar/gz

 

matplotlib篇:

画个天气折线图:

from matplotlib import pyplot as plt
import random

plt.figure(figsize=(20,8),dpi=80)

x=range(120)
y=[random.randint(20,35) for i in range(120)]


plt.plot(x,y)
plt.xticks(x[::2])
plt.yticks(y)

plt.show()

船新版本:

from matplotlib import pyplot as plt
import random
from matplotlib import font_manager
x=range(120)
_x=list(x)
y=[random.randint(20,35) for i in range(120)]
time=["time is 10:{}".format(i) for i in range(0, 60)]
time += ["time is 11:{}".format(i) for i in range(0, 60)]
plt.figure(figsize=(20,8),dpi=80)
plt.plot(x,y)
plt.xticks(_x[::10], time[::10],rotation=45)
plt.xlabel("time")
plt.ylabel("temperature")
plt.title("time and temperature")
plt.show()

 11到30岁交男女朋友的数量折线图:

from matplotlib import pyplot as plt
import random
x=range(11,31)
y=[random.randint(0,5) for i in range(11,31)]
plt.figure(figsize=(20,8),dpi=80)
plt.plot(x,y)
plt.xticks(x,["age:{}th".format(i) for i in x],rotation=45)
plt.ylabel("how many objects")
plt.title("what fuck!?!?!?!?")
plt.show()

 两个人的折线图:

from matplotlib import pyplot as plt
import random
x=range(11,31)
y=[random.randint(0,5) for i in range(11,31)]

a=range(11,31)
b=[random.randint(0,5) for i in range(11,31)]

plt.figure(figsize=(20,8),dpi=80)
plt.plot(x,y,label="me",color="orange",linestyle='--')
plt.plot(a,b,label="another",color="cyan",linestyle=':')
plt.xticks(x,["age:{}th".format(i) for i in x],rotation=45)
plt.ylabel("how many objects")
plt.title("what fuck!?!?!?!?")
#网格grid
plt.grid(alpha=0.8,linestyle=':')#alpha就是透明度
#标签
plt.legend()
plt.show()

 numpy篇:

numpy初探:

import random

import numpy as np
t1 = np.array([1,2,3])
print(t1)
print(type(t1))

t2 = np.array(range(10),dtype='i2')
print(t2)
print(type(t2))
print(t2.dtype)

t3 = t2.astype(dtype='i1')
print(t3)
print(t3.dtype)

t4 = np.arange(10)
print(t4)


t5 = np.array( [round(random.random() , 3) for i in range(10)] )
print(t5)

进阶:

import random
import numpy as np

t1 = np.array([[1,2,3],[4,5,6]])
print(t1.shape)

t1.reshape(3,2)
print(t1)
print(t1.shape)

print(t1.reshape(3,2))#这种有return的函数,一般不对原数组做改变,extend等方法是例外
print(t1.reshape(3,2).shape)

print(t1.shape[0] * t1.shape[1])
print(t1.flatten())#转换成一维
print(t1.flatten().shape[0])

print(t1+2)#广播机制
print(t1*2)

print(t1 + t1)

t2 = np.array([[[1,2,3],[4,5,6],[7,8,9]],
               [[1,2,3],[4,5,6],[7,8,9]],
               [[1,2,3],[4,5,6],[7,8,9]]])
t3 = np.array([[0,1,2],[3,4,5],[7,8,9]])
print(t2 - t3)

t2 = np.array([[[1,2],[4,5],[7,8]],
               [[1,2],[4,5],[7,8]],
               [[1,2],[4,5],[7,8]]])
t3 = np.array([[0],[1],[2]])
print("此之谓广播机制,尾号相同或者为1即可运算")
print(t2.shape)
print(t3.shape)
print(t2-t3)

索引与切片:

a[1, 2]  # 1行2列
a[:, 2]  # 所有行第2列
a[1:3, 2]  # 第1行,第2行的第2列,注意左闭右开
a[[5, 7, 8], 6:9]  # 第5,7,8行的6,7,8列
a[[0, 0], [2, 1]]  # 坐标[0,2],[0,1]的值
a[[1:5:2],:]#1行到4行,步长2,所有列

数组修改:

#a[a>20]=100#矩阵中大于20的直接变成100,这个叫布尔索引
import numpy as np
t=np.array([18,5,10])
t = np.where(t<10,0,10)#np的三元运算符,t中<10的变0,其余变10,注意,这是有返回值的类型
print(t)
t=np.array([18,5,10])
t = t.clip(8,12)#裁剪操作,低的拔高,高的减低,中间不变
print(t)

数组拼接:

np.vstack((t1,t2))#vertically竖直拼接
np.hstack((t1,t2))#horizontally水平拼接

数组交换:

t[[1,3],:]=t[[3,1],:]#不使用第三个变量

构造全1全0的数组:

np.zeros((5,5))
np.ones((5,5))

更多用法:

import numpy as np
t = np.eye(4)#创造一个单位矩阵
print(np.argmax(t,axis=0))#找到某一维度上最大或者最小的数值的位置
print(np.argmin(t,axis=1))

随机数大选:

 

 

import numpy as np
np.random.seed(10)#随机种子
t = np.random.randint(10,20,(5,5))
print(t)
t = np.random.rand(3,3)
print(t)

注意:

a,b相互影响
a=b
a=b[:]
a,b相互不影响
a=b.copy()

 奇奇怪怪的nan,inf(无穷大infinity),-inf(负无穷)

import numpy as np
t=np.array([[1,2,3],[4,5,np.nan]])
print(np.sum(t,axis=0))#sum函数求和,nan和任何数字运算都是nan
print(np.sum(t))

常见数字特征:

import numpy as np
t=np.array([[1,2,3],[4,5,np.nan]])
print(np.mean(t,axis=0))
print(np.median(t,axis=0))
print(np.std(t,axis=0))

 pandas篇:

 

基于numpy开发的,可以处理其他数据类型的工具:

seires一维工具:

import pandas as pd
t = pd.Series({"age":30,"name":"ark","tel":10086})
print(t)
t = pd.Series([1,2,3,4,5],index=['a','b','c','d','e'])
print(t)
print(t*2)
print(t[1:])
print(t[[1,4]])
print(t[t>3])
print(t.index)
print(t.values)

dateframe二维:

import numpy as np
import pandas as pd
import string
t = pd.DataFrame(np.arange(12).reshape((3,4)),index=list(('a','b','c')),columns=list(string.ascii_uppercase[0:4]))
print(t)#横轴是axis=0,纵轴column是axis=1
#用ndarray创建
t = pd.DataFrame({"name":["xiaoming","xiaoma","xiaohei"],"age":["15","16","18"]})#一个中括号代表一列数据
print(t)
#用列表创建
t = pd.DataFrame([["xiao1",1],["xiao2",2],["xiao3",3]],columns=["name","number"])#3个列表表示三行数据,column索引另算
print(t)
#用字典创建
t = pd.DataFrame([{"name":"xiaoma","age":18},{"age":15}])#一个中括号表示一个横行
print(t)
print(t.loc[[0,1]])
data = {"name":["xiaoma","xiaohong"],"tel":["10086","10010"]}#在pandas内部,list格式首先纵着排列,dictionary的key横着排列
t = pd.DataFrame(data,index=["husband","wife"])
print(t)

显然list对应row的内容,dictionary对应column的内容。 

pandas可用于提取和储存CSV(comma seperated value逗号分隔符),JSON(javascript object notation,js对象表示法)格式的文本(具体见菜鸟教程)

pandas可以用于数据清洗:比如处理数据的格式,清除重复的数据等等。

 

posted @ 2022-08-10 16:20  _a_rk  阅读(74)  评论(0编辑  收藏  举报
浏览器标题切换
浏览器标题切换end