sql 查询练习之《西游记》字频词频统计

任务:西游记字频统计,词频统计。

工具:sql sever2012、图悦。

数据:西游记.txt

结果:

sql代码:

-- 字符统计
DECLARE @len int,@i int;
DECLARE @str nchar(1000);

-- 用临时表#array建立统计数组
if object_id('tempdb.dbo.#array') is not null
    drop table #array
if object_id('tempdb.dbo.#hanzi') is not null
    drop table #hanzi
    
create table #array(ch nchar(1) PRIMARY key,cnt int)

-- 将系统表中的汉字提出到#hanzi表,并插入到临时表#array,nchar(19968)为第一个汉字,共20902个汉字。
SELECT top 20902 code=identity(int,19968,1) into #hanzi 
from syscolumns a,syscolumns b

INSERT into #array 
    SELECT nchar(code),0 from #hanzi
    
drop table #hanzi

-- 开始统计
declare cur scroll cursor for
    SELECT contents from xiyouji

open cur
fetch first fetch from cur into @str
while @@fetch_status = 0
begin
    set @len = length(@str)
    set @i = 0
    while @i<@len
    begin
        set @i = @i + 1
        update #array set cnt=cnt+1 where ch=SUBSTRing(@str,@i,1)
    end
    fetch next from cur into @str
end
close cur
DEALLOCATE cur

SELECT top 100 * from #array ORDER BY -cnt
drop TABLE #array

-- 词语统计
-- 先对contents去标点。
SELECT REPLACE(replace(contents,',',''),'','') from xiyouji
-- 先不在#array表放内容,直接从contents内容查询,出现一个词插一个词。
if(select ch from #array where ch=SUBSTRing(@str,@i,2) is null)
    insert into #array values(substring(@str,@i,2),1)
else
    UPDATE #array set cnt = cnt +1 WHERE ch = SUBSTRing(@str,@i,2)

 

posted @ 2020-07-20 10:01  unuliha  阅读(1123)  评论(0编辑  收藏  举报