【科学有故事】做节目时的Python分析

 我爱编程,也喜欢科学,尤其是天文地理历史,从小就喜欢,抱着书看星空的图片。于是无聊时候就会在喜马拉雅里面听关于科学的音频,喜马拉雅大家应该都知道,是在2013年上线的,我2017年才知道,真是晚了些😄下面来一段喜马拉雅的介绍:

喜马拉雅组建于2012年8月,致力于在线音频分享平台的建设与运营,成为音频领域的YouTube。
旗下移动客户端“喜马拉雅 APP”于2013年3月上线,原计划首年实现1000万的用户规模,但实际上仅半年即达成千万用户目标。2014年5月初,喜马拉雅激活用户突破5000万大关。而达到同样的用户规模,SoundCloud 则足足用了六年时间。
2014年5月22日喜马拉雅宣布公司成功获得1150万美元的 A 轮风险投资,投资机构分别为 SIG(海纳亚洲)、KPCB(凯鹏华盈)、Sierra Ventures,成为中国互联网音频行业有史以来金额最大的融资个案。
在喜马拉雅勾画的产业链条里,出版社、作家、播主、粉丝、品牌将通过喜马拉雅平台联结在一起。喜马拉雅FM将助力打造由出版社电台和作家电台组成的出版社电台集群,通过粉丝效应迅速树立出版社品牌,实现经济效益的转化,通过网友打赏等方式实现变现。
2018年8月,为响应改善青少年近视状况的指示、呵护孩子的眼睛,“喜猫儿故事”APP应运而生。它是由国内知名的音频分享平台喜马拉雅正式推出的、面向0-12岁儿童的故事音频平台,定位于“唯一一个平台式儿童内容APP”,旨在为孩子们呈现一个故事的世界,帮助塑造孩子面向未来的视野和格局。
2020年1月9日,胡润研究院发布《2019胡润中国500强民营企业》, 喜马拉雅FM以市值200亿元位列第367位。

于是我盯上了一个节目——汪诘的《科学有故事》,大家也可以听一听。节目背后是一个叫“科学声音”的组织,主要成员有吴京平、卓老板、汪诘、王木头、旭岽等。


谈得太多了,言归正传,hhhO(∩_∩)O~

有一集叫QWERTY键盘的汪诘杂谈,里面提到了Python分析,我一听就来劲,打开网址,看看,分析是这样的


不同版本打字机效率统计分析

Load Packages and Define Functions

1、

import itertools
import warnings
from collections import defaultdict
import numpy as np
import pandas as pd
import scipy.stats
import statsmodels.stats.api as sms
import math 
from scipy import stats
from scipy.stats import ttest_ind
warnings.filterwarnings("ignore")

2、

def split_freq(text):
    """
    Split a text into two-character units and count each unit's occurrences.

    Parameters
    ----------
    text : str
        The text to analyse (any iterable of one-character strings works,
        since it is first turned into a list).

    Returns
    -------
    tuple
        ``(df, total)``: ``df`` is a DataFrame indexed by the two-character
        unit (index name ``0``) with one column ``1`` holding its count;
        ``total`` is the number of units that were counted.

    Notes
    -----
    Two lists of pairs are counted together: the pairs starting at every
    position, and the pairs starting from the second character onwards.
    The second list is the first one minus its first element, so every pair
    except the very first is counted twice. That behaviour is kept unchanged
    so existing results do not shift.
    NOTE(review): confirm the double counting is intended.
    """
    chars = list(text)
    pairs = [left + right for left, right in zip(chars, chars[1:])]
    # zip(chars[1:], chars[2:]) yields exactly the tail of ``pairs``.
    units = pairs + pairs[1:]

    counts = defaultdict(int)
    for unit in units:
        counts[unit] += 1

    # Explicit column labels keep set_index(0) valid even when the text is
    # too short to contain a single pair (the frame is then simply empty).
    df = pd.DataFrame(list(counts.items()), columns=[0, 1]).set_index(0)

    return (df, len(units))

3、

def get_result(target, df):
    """
    Build the adjacent-key pair frequency table for the three layouts.

    For each of the ``ABC``, ``QWERTY`` and ``QWERTY_m`` columns of
    ``target``, the listed pairs are looked up in ``df``; pairs that do not
    occur in ``df`` are dropped, a leading ``"sum"`` row with the total count
    is added, and the rows are sorted by count in descending order. The
    three per-layout tables are then placed side by side.

    Parameters
    ----------
    target : pandas.DataFrame
        Must contain the columns ``"ABC"``, ``"QWERTY"`` and ``"QWERTY_m"``
        holding pair labels; missing values are ignored.
    df : pandas.DataFrame
        Indexed by pair label with a single count column (the shape
        produced by ``split_freq``).

    Returns
    -------
    pandas.DataFrame
        Columns ``word_ABC, freq_ABC, word_QWERTY, freq_QWERTY,
        word_QWERTY_m, freq_QWERTY_m``; layouts with fewer rows are padded
        with NaN.
    """
    tables = []
    for layout in ("ABC", "QWERTY", "QWERTY_m"):
        word_col = "word_" + layout
        freq_col = "freq_" + layout

        # Columns of ``target`` have different lengths and are NaN-padded.
        labels = [label for label in target[layout] if pd.notna(label)]

        # reindex tolerates labels that are absent from ``df`` (they become
        # NaN rows, removed by dropna); ``.loc`` with a list raises KeyError
        # for missing labels on current pandas versions.
        found = df.reindex(labels).dropna(axis=0).reset_index()
        found.columns = [word_col, freq_col]

        total = pd.DataFrame(
            [{word_col: "sum", freq_col: found[freq_col].sum()}]
        )
        ranked = (
            pd.concat([total, found], ignore_index=True)
            .sort_values(freq_col, ascending=False)
            .reset_index(drop=True)
        )
        tables.append(ranked)

    return pd.concat(tables, axis=1)

4、

def get_result_lr(QWERTY_right_left,QWERTY_left_right,ABC_right_left,ABC_left_right,QWERTY_m_right_left,QWERTY_m_left_right, df):
    """
    Build the hand-alternating pair frequency table for the three layouts.

    For each layout the right-to-left and left-to-right pair labels are
    concatenated and looked up in ``df``; pairs that do not occur in ``df``
    are dropped, a leading ``"sum"`` row with the total count is added, and
    the rows are sorted by count in descending order. The three per-layout
    tables are then placed side by side.

    Parameters
    ----------
    QWERTY_right_left, QWERTY_left_right,
    ABC_right_left, ABC_left_right,
    QWERTY_m_right_left, QWERTY_m_left_right : iterable of str
        Pair labels for each layout and direction.
    df : pandas.DataFrame
        Indexed by pair label with a single count column (the shape
        produced by ``split_freq``).

    Returns
    -------
    pandas.DataFrame
        Columns ``ABC_right_left, freq_ABC_right_left, QWERTY_right_left,
        freq_QWERTY_right_left, QWERTY_m_right_left,
        freq_QWERTY_m_right_left``; layouts with fewer rows are padded with
        NaN. Both directions share the ``*_right_left`` column names.
    """
    layouts = (
        ("ABC", ABC_right_left, ABC_left_right),
        ("QWERTY", QWERTY_right_left, QWERTY_left_right),
        ("QWERTY_m", QWERTY_m_right_left, QWERTY_m_left_right),
    )

    tables = []
    for layout, right_left, left_right in layouts:
        word_col = layout + "_right_left"
        freq_col = "freq_" + word_col

        labels = [
            label
            for label in list(right_left) + list(left_right)
            if pd.notna(label)
        ]

        # reindex tolerates labels that are absent from ``df`` (they become
        # NaN rows, removed by dropna); ``.loc`` with a list raises KeyError
        # for missing labels on current pandas versions.
        found = df.reindex(labels).dropna(axis=0).reset_index()
        found.columns = [word_col, freq_col]

        total = pd.DataFrame(
            [{word_col: "sum", freq_col: found[freq_col].sum()}]
        )
        ranked = (
            pd.concat([total, found], ignore_index=True)
            .sort_values(freq_col, ascending=False)
            .reset_index(drop=True)
        )
        tables.append(ranked)

    return pd.concat(tables, axis=1)

5、

# Column labels of the one-row summary produced by eva().
col = [
    "book No.",
    "num_combinations",
    "ABC%",
    "QWERTY%",
    "QWERTY_m%",
    "QWERTY vs ABC %",
    "QWERTY_m vs QWERTY %",
    "QWERTY_m vs ABC %",
]


def eva(result, i, ni):
    """
    Summarise one text's adjacent-pair totals as percentages per layout.

    Parameters
    ----------
    result : pandas.DataFrame
        Table with ``freq_ABC``, ``freq_QWERTY`` and ``freq_QWERTY_m``
        columns; row ``0`` of each is read as that layout's total (the
        ``"sum"`` row when the table comes from ``get_result``).
    i : object
        Label stored in the ``"book No."`` column.
    ni : int
        Total number of counted pairs in the text; used as the denominator
        of every percentage and stored in ``"num_combinations"``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns listed in ``col``.
    """
    # Use the ``ni`` argument as denominator rather than a module-level
    # ``n``, so the result depends only on what the caller passes in.
    abc_per = round(result.freq_ABC[0] / ni * 100, 3)
    QWERTY_per = round(result.freq_QWERTY[0] / ni * 100, 3)
    QWERTY_m_per = round(result.freq_QWERTY_m[0] / ni * 100, 3)

    delta = round((QWERTY_per - abc_per) / abc_per * 100, 3)
    # NOTE(review): this change is expressed relative to QWERTY_m_per (the
    # new value), whereas the other two deltas divide by their baseline.
    # Kept unchanged so previously published figures still match; confirm
    # which denominator is intended.
    delta2 = round((QWERTY_m_per - QWERTY_per) / QWERTY_m_per * 100, 3)
    delta3 = round((QWERTY_m_per - abc_per) / abc_per * 100, 3)

    compare_n = pd.DataFrame(
        [[i, ni, abc_per, QWERTY_per, QWERTY_m_per, delta, delta2, delta3]],
        columns=col,
    )
    return compare_n

6、

# Column labels of the one-row summary produced by eva_lr().
col_lr = [
    "book No.",
    "num_combinations",
    "ABC_right_left%",
    "QWERTY_right_left%",
    "QWERTY_m_right_left%",
    "QWERTY vs ABC right left %",
    "QWERTY_m vs QWERTY %",
]


def eva_lr(result, i, ni):
    """
    Summarise one text's hand-alternating pair totals as percentages.

    Parameters
    ----------
    result : pandas.DataFrame
        Table with ``freq_ABC_right_left``, ``freq_QWERTY_right_left`` and
        ``freq_QWERTY_m_right_left`` columns; row ``0`` of each is read as
        that layout's total (the ``"sum"`` row when the table comes from
        ``get_result_lr``).
    i : object
        Label stored in the ``"book No."`` column.
    ni : int
        Total number of counted pairs in the text; used as the denominator
        of every percentage and stored in ``"num_combinations"``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns listed in ``col_lr``.
    """
    # Use the ``ni`` argument as denominator rather than a module-level
    # ``n``, so the result depends only on what the caller passes in.
    ABC_right_left_per = round(result.freq_ABC_right_left[0] / ni * 100, 3)
    QWERTY_right_left_per = round(result.freq_QWERTY_right_left[0] / ni * 100, 3)
    QWERTY_m_right_left_per = round(result.freq_QWERTY_m_right_left[0] / ni * 100, 3)

    # Relative changes, each expressed against the older layout's share.
    delta = round(
        (QWERTY_right_left_per - ABC_right_left_per) / ABC_right_left_per * 100, 3
    )
    delta2 = round(
        (QWERTY_m_right_left_per - QWERTY_right_left_per)
        / QWERTY_right_left_per
        * 100,
        3,
    )

    compare_n = pd.DataFrame(
        [[
            i,
            ni,
            ABC_right_left_per,
            QWERTY_right_left_per,
            QWERTY_m_right_left_per,
            delta,
            delta2,
        ]],
        columns=col_lr,
    )
    return compare_n

7、

def weighted_avg_and_std(data, weights):
    """
    Return the weighted mean and the weighted standard deviation.

    The variance is the weighted average of the squared deviations from the
    weighted mean; its square root is returned as the second element.
    """
    mean = np.average(data, weights=weights)
    squared_deviation = (data - mean) ** 2
    spread = math.sqrt(np.average(squared_deviation, weights=weights))
    return (mean, spread)


def weighted_mean_confidence_interval(data, weights, confidence=0.95):
    """
    Print the weighted mean of ``data`` and an interval around it.

    The half-width of the interval is the weighted standard deviation from
    ``weighted_avg_and_std`` multiplied by the two-sided Student-t quantile
    for ``confidence`` with ``len(data) - 1`` degrees of freedom. The
    function only prints; it returns ``None``.

    NOTE(review): the spread used here is the weighted standard deviation
    itself (it is not divided by sqrt(n)) -- confirm this is the intended
    definition of the interval.
    """
    sample_size = len(1.0 * np.array(data))
    center, spread = weighted_avg_and_std(data, weights)
    t_quantile = scipy.stats.t.ppf((1 + confidence) / 2.0, sample_size - 1)
    half_width = spread * t_quantile
    print(
        "Weighted Mean: %.3f \nWeighted %0.4f Confidence Interval: [%.3f,%.3f]"
        % (center, confidence, center - half_width, center + half_width)
    )

8、

def product(a, b):
    """
    Return every two-key combination that goes from ``a`` to ``b``.

    ``a`` holds the keys typed by one hand and ``b`` the keys typed by the
    other hand. Each element of ``a`` is paired with each element of ``b``
    (in that order), pairs containing a missing value are discarded, and the
    two keys of every remaining pair are joined with ``+`` into one Series.
    """
    pairs = pd.DataFrame([combo for combo in itertools.product(a, b)])
    complete = pairs.dropna()
    first_key = complete[0]
    second_key = complete[1]
    return first_key + second_key

1. 打字机中,相邻字母(符号)组合有哪些?左右手打字的字母分别有哪些?

  • 其中 “ABC” 代表 ABCD 打字机的相邻字母的组合,“QWERTY” 代表第一版 QWE.TY 打字机相邻字母组合,“QWERTY_m” 代表现在键盘相邻字母组合。
  • “ABC_left” 代表 ABCD 打字机的所有左手打的字母,“ABC_right” 代表 ABCD 打字机的所有右手打的字母。第一版 QWE.TY 打字机亦然。
  • 把原文的大小写字母的文章,改成全小写,以进行更全面的统计。
  • 由于不同打字机字母组合数量不同,请忽略表格末尾出现的 NaN。

9、

# Load the table of target key combinations from target.csv.
target = pd.read_csv("target.csv")
# Bare expression: presumably displays the table as notebook cell output.
target
ABC QWERTY QWERTY_m ABC_left ABC_right QWERTY_left QWERTY_right QWERTY_m_left QWERTY_m_right
0 -3 23 23 - r 2 7 2 7
1 35 34 34 3 s 3 8 3 8
2 57 45 45 5 t 4 9 4 9
3 79 56 56 7 u 5 - 5 0
4 9n 67 67 9 v 6 , 6 -
5 no 78 78 n w q y q y
6 op 89 89 o x w i w u
7 pq 9- 90 p y e u e i
8 qr -, 0- q z . o r o
9 rs qw qw 2 e t p t p
10 st we we 4 f z h a h
11 tu e. er 6 g s j s j
12 uv .t rt 8 h d k d k
13 vw ty ty . i f l f l
14 wx yi yu a j g m g n
15 xy iu ui b k a b z m
16 yz op io c l x n x ,
17 24 zs op d m & ? c .
18 46 sd as NaN NaN c ; v ;
19 68 df sd NaN NaN v r b :
20 8. fg df NaN NaN NaN NaN NaN NaN
21 .a hj fg NaN NaN NaN NaN NaN NaN
22 ab kl gh NaN NaN NaN NaN NaN NaN
23 bc lm hj NaN NaN NaN NaN NaN NaN
24 cd ax jk NaN NaN NaN NaN NaN NaN
25 de x& kl NaN NaN NaN NaN NaN NaN
26 ef &c l; NaN NaN NaN NaN NaN NaN
27 fg cv zx NaN NaN NaN NaN NaN NaN
28 gh bn xc NaN NaN NaN NaN NaN NaN
29 hi n? cv NaN NaN NaN NaN NaN NaN
30 ij ?; vb NaN NaN NaN NaN NaN NaN
31 jk ;R bn NaN NaN NaN NaN NaN NaN
32 kl NaN nm NaN NaN NaN NaN NaN NaN
33 lm NaN m, NaN NaN NaN NaN NaN NaN

2. 相邻字母(符号)出现的频率对比:

ABCD 打字机,第一版 QWE.TY 打字机,以及现代 QWERTY 键盘

获得整本圣经(美国标准版)数据

  • 数据来源:American Standard Version (ASV) https://bible4u.net/en/download#ASV (原文此处的配图转存失败,未能显示)
  • 以下省略(用“#”注释掉)了单独圣经的分析结果。

10、

# Load the Bible text in lower case; the context manager closes the file
# handle as soon as the text has been read.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm the file's encoding.
with open("books/Bible_ASV.txt", "r") as bible_file:
    text1 = bible_file.read().lower()
# Count the two-character units, tabulate each layout's adjacent pairs and
# save the table.
df, n = split_freq(text1)
result1 = get_result(target, df)
result1.to_csv("result/Bible_ASV.csv")
# result1  (uncomment to display the full table)

11、

# Summarise the Bible table as percentages; n is the unit total returned by
# split_freq for the same text.
eva(result1,"bible",n)

获得 30 本打字机发明的年代的美国畅销书数据

book No. num_combinations ABC% QWERTY% QWERTY_m% QWERTY vs ABC % QWERTY_m vs QWERTY % QWERTY_m vs ABC %
0 bible 10405529 2.707 0.601 2.29 -77.798 73.755 -15.405
def _read_lower(path):
    """Return the lower-cased content of the text file at ``path``.

    The file is opened inside a context manager so its handle is closed as
    soon as the content has been read.
    NOTE(review): no encoding is passed, so the platform default is used --
    confirm the files' encoding.
    """
    with open(path, "r") as handle:
        return handle.read().lower()


# One lower-cased text per book; the names t1..t30 are kept because the
# following cell concatenates them.
t1 = _read_lower("books/bad_boy.txt")
t2 = _read_lower("books/Blue_Jackets.txt")
t3 = _read_lower("books/From_the_Earth_to_the_Moon.txt")
t4 = _read_lower("books/Joseph_and_His_Friend.txt")
t5 = _read_lower("books/Lothair.txt")
t6 = _read_lower("books/Man_and_Wife.txt")
t7 = _read_lower("books/Memoir_of_Jane_Austen.txt")
t8 = _read_lower("books/The_Adventures_of_Harry_Richmond.txt")
t9 = _read_lower("books/The_Caged_Lion.txt")
t10 = _read_lower("books/The_Earthly_Paradis.txt")
t11 = _read_lower("books/The_Mystery_of_Edwin_Drood.txt")
t12 = _read_lower("books/The_Visionary.txt")
t13 = _read_lower("books/The_Vicar_of_Bullhampton_by_Anthony_Trollope.txt")
t14 = _read_lower("books/The_Wild_Garden.txt")
t15 = _read_lower("books/Twenty_Thousand_Leagues_under_the_Sea_by_Jules_Verne.txt")
t16 = _read_lower("books/Venus_in_Furs_by_Ritter_von_Leopold_Sacher-Masoch.txt")
t17 = _read_lower("books/The_Adventures_of_Tom_Sawyer.txt")
t18 = _read_lower("books/Atthe_Back_of_the_North_Wind.txt")
t19 = _read_lower("books/Coles_Funny_Picture_Book.txt")
t20 = _read_lower("books/The_Cuckoo_Clock.txt")
t21 = _read_lower("books/The_Lost_Princess.txt")
t22 = _read_lower("books/Mildred_Keith.txt")
t23 = _read_lower("books/The_Princess_and_the_Goblin.txt")
t24 = _read_lower("books/What_Katy_Did.txt")
t25 = _read_lower("books/Under_the_Window.txt")
t26 = _read_lower("books/Carmilla.txt")
t27 = _read_lower("books/Erewhon.txt")
t28 = _read_lower("books/Daisy_Miller.txt")
t29 = _read_lower("books/Leavenworth.txt")
t30 = _read_lower("books/Rosein_Bloom.txt")

13、

# Concatenate the 30 book texts, in order, into one string.
text2 = "".join(
    (
        t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,
        t11, t12, t13, t14, t15, t16, t17, t18, t19, t20,
        t21, t22, t23, t24, t25, t26, t27, t28, t29, t30,
    )
)

14、

# Count the two-character units of the combined 30-book text, tabulate each
# layout's adjacent pairs and save the table. Note that df and n are
# rebound here, replacing the values computed for the Bible text.
df, n = split_freq(text2)
result2 = get_result(target, df)
result2.to_csv("result/books30.csv")
# result2  (uncomment to display the full table)

15、

# Summarise the 30-book table as percentages; n is the unit total returned
# by split_freq for the combined text.
eva(result2,"2-30",n)

以下省略,全文请点击链接 http://www.kexueshengyin.com/typerAnalysis.html 。我只是转载一下,说实话只能看懂百分之二😢

仰慕科学声音!

posted @ 2020-04-07 13:02  Aeterna_Gungnir  阅读(320)  评论(0)  编辑  收藏  举报