stata PYTHON 文书筛选
// Purpose: filter the 2018 court-judgment CSV files and tag each row with a
// province ("sheng") and, in a second pass, a city ("shi").
//
// NOTE(review): this whole script sits on ONE physical line. Stata commands
// are newline-terminated, and the first "//" below comments out everything
// after it on the line, so the line breaks must be restored before running.
//
// What the visible commands do, in order:
//   1. Create folder 1012a (errors ignored via cap), cd into the data folder,
//      and list *.csv with `fs`; the names are read from r(files).
//      (`fs` looks like a user-written command -- TODO confirm it is installed.)
//   2. For each CSV: increment shu (starts at 4, so the first output is xa5),
//      import the file, and try to keep eight named columns. If that keep
//      fails (_rc != 0), rename every variable to the value in its first row
//      -- presumably the header was imported as data; verify. Then keep rows
//      where 审理程序 == "一审" and save to d:\te\xa<shu>.
//   3. Rename variables from the first row again, create an empty string
//      variable sheng, and for every row i: read 所属地区[i] into cc, preserve,
//      load sheng1.dta, and take the first sheng value for which
//      regexm(cc, value) is true (cv; bz flags a hit), then restore and write
//      cv into sheng for row i.
//   4. Save to d:\te\xx<shu> and 1012a\a1, then loop iv = 1..4 over
//      d:\te\pan\a<iv>; the city/county lookup against 1001\quanguoshengshi is
//      entirely commented out.
//
// NOTE(review): problems visible in the text as it stands:
//   - `replace sheng="`cv' in `i'` is missing the closing double quote.
//   - Braces do not balance: there is a stray "}" after
//     `save d:\te\xx`shu',replace` and another at the very end.
//   - Inside `forvalues iv=1/4` the code calls `restore` and uses `cv' and `i'
//     with no preserve / row loop defining them; the enclosing loop seems to
//     have been lost. Treat this part as a fragment.
//   - The k33 values are used as regular expressions by regexm; names that
//     contain regex metacharacters would not match literally.
cap:mkdir 1012a cd D:\te\pan\2018年裁判文书数据_马克数据网 fs *.csv local shu=4 foreach file in `r(files)'{ local shu=`shu'+1 import delimited "`file'", clear cap:keep 案件名称 所属地区 案件类型 案件类型编码 审理程序 裁判日期 案由 全文 if _rc{ foreach var of varlist _all{ local cx=`var'[1] rename `var' `cx' } } keep if 审理程序=="一审" save d:\te\xa`shu',replace } foreach var of varlist _all{ local cx=`var'[1] rename `var' `cx' } gen sheng="" local k=_N forvalues i=1/`k'{ local cc=所属地区[`i'] preserve local bz=0 local cv="" use sheng1,clear local k3=_N forvalues iii=1/`k3'{ local k33=sheng[`iii'] if regexm("`cc'","`k33'"){ disp "`cc'" local cv="`k33'" local bz=1 continue,break } } restore replace sheng="`cv' in `i' } save d:\te\xx`shu',replace } save 1012a\a1,replace forvalues iv=1/4{ use d:\te\pan\a`iv',clear // if `bz'==0{ // use 1001\quanguoshengshi,clear // local k1=_N // forvalues ii=1/`k1'{ // local k11=shi[`ii'] // local k22=xian[`ii'] // // disp "`cc'" // if regexm("`cc'","`k11'"){ // local cv="`k11'" // continue,break // } // if regexm("`ccc'","`k11'"){ // local cv="`k11'" // continue,break // } // if "`cv'"==""{ // if regexm("`cc'","`k22'"){ // local cv="`k11'" // continue,break // } // } // if "`cv'"==""{ // if regexm("`ccc'","`k22'"){ // local cv="`k11'" // continue,break // } // } // } // } restore replace shi="`cv'" in `i' disp `i' } save d:\te\pan\a`iv'x,replace }
# Province tagging from the full text.
#
# For every CSV in the current working directory whose file name contains
# "ta", look through each row's full text (column "全文") for the first
# province name listed in shengx.dta (column "sheng") and store it in a new
# "sheng" column. An empty "shi" column is added as well. The result is
# written to "ua<N>x.csv", where N counts the processed files.
import pandas as pd
import dask.dataframe as dd
import os,glob,sys


def _first_name_in(text, names):
    """Return the first entry of `names` that is a substring of `text`, else ""."""
    for name in names:
        if name in text:
            return name
    return ""


directory=os.getcwd()
# glob yields paths in arbitrary order; sorting keeps the ua<N>x.csv numbering
# reproducible between runs.
files=sorted(glob.glob(directory+"/*"))
au=0
# Loaded on first use so the lookup file is read once, and only when there is
# at least one CSV to process.
province_names=None

for file in files:
    # Test the file name only: the full path also contains the directory, and
    # a folder name containing "ta" or ".csv" would otherwise select every file.
    fname=os.path.basename(file)
    if not (".csv" in fname and "ta" in fname):
        continue

    au=au+1
    df = pd.read_csv(file)
    df["sheng"]=""
    df["shi"]=""

    if province_names is None:
        df1 = pd.read_stata('shengx.dta')
        # Skip blank / non-string entries: "" is a substring of every text and
        # would end the search with an empty result; a non-string would raise.
        province_names=[s for s in df1["sheng"] if isinstance(s,str) and s]

    # Collect the results positionally and assign the column in one step, so
    # the outcome does not depend on the frame's index labels.
    found=[]
    for text in df["全文"]:
        hit=_first_name_in(str(text),province_names)
        if hit:
            print(file,hit)
        found.append(hit)
    df["sheng"]=found

    # NOTE(review): to_csv writes the index as an extra unnamed column; kept
    # as in the original so downstream readers see the same layout.
    df.to_csv("ua"+str(au)+"x.csv")
# Province/city tagging from the region field.
#
# For every CSV in the current working directory, look through each row's
# region (column "所属地区") for the first city name listed in shengw.dta
# (column "shi"). On a hit, the city goes into the "shi" column and the
# province from the same lookup row (column "sheng") goes into the "sheng"
# column. The result is written to "ta<N>x.csv", where N counts the
# processed files.
import pandas as pd
import dask.dataframe as dd
import os,glob,sys


def _first_city_in(text, pairs):
    """Return the first (sheng, shi) pair whose shi is a substring of `text`, else ("", "")."""
    for sheng, shi in pairs:
        if shi in text:
            return sheng, shi
    return "", ""


directory=os.getcwd()
# glob yields paths in arbitrary order; sorting keeps the ta<N>x.csv numbering
# reproducible between runs.
files=sorted(glob.glob(directory+"/*"))
au=0
# Loaded on first use so the lookup file is read once, and only when there is
# at least one CSV to process.
city_pairs=None

for file in files:
    # Test the file name only: the full path also contains the directory, and
    # a folder name containing ".csv" would otherwise select every file.
    # NOTE(review): this also picks up ta*/ua* outputs left by earlier runs.
    fname=os.path.basename(file)
    if ".csv" not in fname:
        continue

    au=au+1
    df = pd.read_csv(file)
    df["sheng"]=""
    df["shi"]=""

    if city_pairs is None:
        df1 = pd.read_stata('shengw.dta')
        # Skip blank / non-string city names: "" is a substring of every text
        # and would end the search with an empty result; a non-string would raise.
        city_pairs=[(sheng,shi) for sheng,shi in zip(df1["sheng"],df1["shi"])
                    if isinstance(shi,str) and shi]

    # The original wrote the matched city into "sheng", never used the
    # province it unpacked, and left "shi" empty. Each value now goes to its
    # own column.
    provinces=[]
    cities=[]
    for region in df["所属地区"]:
        sheng,shi1=_first_city_in(str(region),city_pairs)
        if shi1:
            print(file,shi1)
        provinces.append(sheng)
        cities.append(shi1)
    # Assigned positionally, so the result does not depend on index labels.
    df["sheng"]=provinces
    df["shi"]=cities

    # NOTE(review): to_csv writes the index as an extra unnamed column; kept
    # as in the original so downstream readers see the same layout.
    df.to_csv("ta"+str(au)+"x.csv")