sql 查询练习之《西游记》字频词频统计
任务:西游记字频统计,词频统计。
工具:sql sever2012、图悦。
数据:西游记.txt
结果:
sql代码:
-- 字符统计 DECLARE @len int,@i int; DECLARE @str nchar(1000); -- 用临时表#array建立统计数组 if object_id('tempdb.dbo.#array') is not null drop table #array if object_id('tempdb.dbo.#hanzi') is not null drop table #hanzi create table #array(ch nchar(1) PRIMARY key,cnt int) -- 将系统表中的汉字提出到#hanzi表,并插入到临时表#array,nchar(19968)为第一个汉字,共20902个汉字。 SELECT top 20902 code=identity(int,19968,1) into #hanzi from syscolumns a,syscolumns b INSERT into #array SELECT nchar(code),0 from #hanzi drop table #hanzi -- 开始统计 declare cur scroll cursor for SELECT contents from xiyouji open cur fetch first fetch from cur into @str while @@fetch_status = 0 begin set @len = length(@str) set @i = 0 while @i<@len begin set @i = @i + 1 update #array set cnt=cnt+1 where ch=SUBSTRing(@str,@i,1) end fetch next from cur into @str end close cur DEALLOCATE cur SELECT top 100 * from #array ORDER BY -cnt drop TABLE #array -- 词语统计 -- 先对contents去标点。 SELECT REPLACE(replace(contents,',',''),'。','') from xiyouji -- 先不在#array表放内容,直接从contents内容查询,出现一个词插一个词。 if(select ch from #array where ch=SUBSTRing(@str,@i,2) is null) insert into #array values(substring(@str,@i,2),1) else UPDATE #array set cnt = cnt +1 WHERE ch = SUBSTRing(@str,@i,2)