# _io.TextIOWrapper

'''
SELECT * FROM Info_Roles WHERE Flag=1 LIMIT 2;


 select   top   y   *   from   <table>   where   <primary_key>   not   in(select   top   (x-1)*y   <primary_key>   from   <table>)



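  If the table has no primary key, use a temporary table with an added identity column instead. x (page number) and y (page size) here can be variables.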

  select   id=identity(int,1,1),*     into   #tb   from   <table>
  select   *   from   #tb   where   id   between   (x-1)*y+1   and   x*y




 select   top   1000   Info_ID   from   Info_Roles
 select   top   2000   Info_ID,',xiaole20180410SPLIT,',content   from   Info_Content   where   Info_ID   not   in( select   top   1000   Info_ID   from   Info_Roles   )   ;
 select   top   399   Info_ID,',xiaole20180410SPLIT,',UPPER(content)   from   Info_Content      ;
 select   top   399   CHARINDEX('IMG',UPPER(content))   from   Info_Content      ;
 select   top   15   Info_ID,',xiaole20180410SPLIT,',content   from   Info_Content   where  CHARINDEX('IMG',UPPER(content))>0;
 select   top   15   Info_ID,',xiaole20180410SPLIT,',content   from   Info_Content   where
 Info_ID      in( select   top   1000   Info_ID   from   Info_Roles   )  and
  CHARINDEX('IMG',UPPER(content))>0
 ;



SELECT
	TOP 15 Info_ID,
	',xiaole20180410SPLIT,',
	content
FROM
	Info_Content
WHERE
	Info_ID IN (
		SELECT
			TOP 1000 Info_ID
		FROM
			Info_Roles
		WHERE
			Flag = 1
	)
AND CHARINDEX('IMG', UPPER(content)) > 0;





SELECT
	TOP 200 Info_ID,
	',xiaole20180410SPLIT,',
	content
FROM
	Info_Content
WHERE
	Info_ID IN (
		SELECT
			TOP 90000 Info_ID
		FROM
			Info_Roles
	)
AND CHARINDEX('<IMG', UPPER(content)) > 0;



'''

from bs4 import BeautifulSoup
from selenium import webdriver

# Marker that separates the record id column from the HTML content column
# in each exported row of the dump file.
xlsplit_str = ',xiaole20180410SPLIT,'
# Input dump and the mapping built from it: {'byhand<N>': {'html': <markup>}}.
f_db_txt, uid_d = 'db.uid.para.byhand.txt', {}
uid_ = 0  # running counter used to generate record ids
uid = None  # id of the record currently being assembled, None before the first one
with open(f_db_txt, 'r', encoding='utf-8') as fr:
    for line in fr:
        line = line.replace('\t', '').replace('\n', '')
        if xlsplit_str in line:
            # A marker line opens a new record; the text following the first
            # marker is the beginning of that record's HTML.
            fields = line.split(xlsplit_str)
            uid_ += 1
            uid = '{}{}'.format('byhand', uid_)
            uid_d[uid] = {'html': fields[1]}
        elif uid is not None:
            # Continuation of a multi-line HTML value.
            uid_d[uid]['html'] += line
        # Lines that appear before the first marker belong to no record and
        # are skipped (previously they raised NameError on the unbound uid).

r_d = {}

# Chinese sentence segmentation: the tokens treated as sentence terminators.
# NOTE: Cut() examines text one character at a time, so the multi-character
# entries ('...' and '、、、') can never match a single character there.
cutlist = [
    '。', '；', '？',
    '.', ';', '?',
    '...', '、、、',
    '：', ':',
    '，', ',',
]


# Tell whether a character is one of the sentence-terminator tokens.
def FindToken(cutlist, char):
    """Return True when *char* is contained in *cutlist*, otherwise False.

    Args:
        cutlist: Container of sentence-terminator tokens.
        char: The token to look up.
    """
    return char in cutlist


# Core sentence-splitting function.
def Cut(cutlist, lines):
    """Split *lines* into sentences, keeping each terminator with its sentence.

    The text is examined item by item; an item that is contained in
    *cutlist* closes the current sentence. When *lines* is a plain string
    (or a list of single characters), entries of *cutlist* longer than one
    character therefore never match.

    Args:
        cutlist: Container of sentence-terminator tokens.
        lines: The text to split, as a string or a sequence of characters.

    Returns:
        A list of sentences in their original order. Every sentence ends
        with the terminator that closed it, except possibly the last one:
        text following the final terminator is returned as a trailing
        element instead of being discarded.
    """
    sentences = []  # completed sentences
    current = []  # characters collected since the last terminator

    for char in lines:
        current.append(char)
        if char in cutlist:
            sentences.append(''.join(current))
            current = []

    # Keep trailing text that was not closed by a terminator.
    if current:
        sentences.append(''.join(current))
    return sentences


'''

'''


def paragraph_to_sentence(paragraph, sentence_l):
    """Split *paragraph* into sentences and append them to *sentence_l*.

    Space characters (U+0020) are removed first. The remaining text is then
    split at every occurrence of any delimiter; the delimiters themselves
    are not kept and empty fragments are dropped.

    The previous implementation stopped at the first delimiter that did not
    occur in the text (so text without a newline was never split), ignored
    everything after the second fragment, and re-appended prefixes.

    Args:
        paragraph: The text to split.
        sentence_l: List that receives the sentences; it is modified in place.

    Returns:
        The same *sentence_l* object, for convenience.
    """
    import re  # kept local so this block is self-contained

    sentence_split_l = ['\n', '\t', '。', '；', '？', '.', ';', '?', '...', '、、、', '，', ',']
    # Longest delimiters first so that '...' is matched as one delimiter
    # rather than three separate '.' matches.
    alternatives = sorted(sentence_split_l, key=len, reverse=True)
    splitter = re.compile('|'.join(re.escape(token) for token in alternatives))

    text = paragraph.replace(' ', '')
    sentence_l.extend(piece for piece in splitter.split(text) if piece)
    return sentence_l


def paragraph_to_sentence_no_recursion(paragraph, sentence_l):
    """Iteratively split *paragraph* into sentences appended to *sentence_l*.

    Space characters (U+0020) are removed first. Every delimiter is then
    rewritten to a single marker and the text is split on that marker; the
    delimiters are not kept and empty fragments are dropped.

    The previous body was recursive despite the function's name, stopped at
    the first delimiter missing from the text, and discarded everything
    after the second fragment.

    Args:
        paragraph: The text to split.
        sentence_l: List that receives the sentences; it is modified in place.

    Returns:
        The same *sentence_l* object, for convenience.
    """
    sentence_split_l = ['\n', '\t', '。', '；', '？', '.', ';', '?', '...', '、、、', '，', ',']
    marker = '\n'  # already a delimiter, so it cannot survive inside a sentence

    text = paragraph.replace(' ', '')
    # Longest delimiters first so multi-character ones are replaced whole.
    for delimiter in sorted(sentence_split_l, key=len, reverse=True):
        text = text.replace(delimiter, marker)

    sentence_l.extend(piece for piece in text.split(marker) if piece)
    return sentence_l


# Module-level scratch run of the splitting loop on an empty paragraph.
sentence_split_l = ['\n', '\t', '。', '；', '？', '.', ';', '?', '...', '、、、', '，', ',']
sentence_l = []
paragraph = ''.replace(' ', '')
for delimiter in sentence_split_l:
    pieces = paragraph.split(delimiter)
    sentence_l.append(pieces[0])
    # Stop at the first delimiter that does not occur in the text.
    if len(pieces) <= 1:
        break
    paragraph_to_sentence(pieces[1], sentence_l)


def sentence_l_to_sentence_l_l(sentence_l):
    """Split every entry of *sentence_l* further and return one flat list.

    Each entry is split at every occurrence of any delimiter; delimiters are
    not kept and empty fragments are dropped. An entry containing no
    delimiter is emitted exactly once.

    The previous implementation appended one result per delimiter, so an
    entry without delimiters was emitted twelve times and split entries
    were accompanied by unsplit duplicates.

    Args:
        sentence_l: Iterable of strings to split.

    Returns:
        A new flat list of fragments, in their original order.
    """
    sentence_split_l = ['\n', '\t', '。', '；', '？', '.', ';', '?', '...', '、、、', '，', ',']
    marker = '\n'  # already a delimiter, so it cannot survive inside a fragment
    # Longest delimiters first so multi-character ones are replaced whole.
    ordered_delimiters = sorted(sentence_split_l, key=len, reverse=True)

    sentence_l_l = []
    for sentence in sentence_l:
        for delimiter in ordered_delimiters:
            sentence = sentence.replace(delimiter, marker)
        sentence_l_l.extend(piece for piece in sentence.split(marker) if piece)
    return sentence_l_l


import requests, time, threading

# Image directory. An earlier assignment pointing at the spider's
# dl_img_tmp directory was immediately overwritten by this value.
img_dir = '{}{}'.format('C:\\Users\\sas\\PycharmProjects\\produce_video\\', 'mypng\\')
import random

import os, time, glob

# Names of the two report files, derived from this script's own file name.
os_sep = os.sep
this_file_abspath = os.path.abspath(__file__)
this_file_dirname = os.path.dirname(this_file_abspath)
this_file_name = os.path.basename(this_file_abspath)
# Multi-line report: one sentence per line.
fw_f = '{}{}'.format(this_file_name, '.txt')
# Single-row report. It needs its own name: stripping '.txt' from fw_f and
# appending it again produced the same name as fw_f, so both reports were
# opened on one file.
fw_f_onerow = '{}{}'.format(this_file_name, '.onerow.txt')




# Export filter and output limits.
sen_num = 32  # an article needs at least this many sentences to be exported
max_sen_len = 64  # an article containing a longer sentence is skipped
max_sen_out = 31  # number of leading sentences written per article

# For every loaded record: extract the text of its HTML, split it into
# sentences, and write the articles that pass the filter to both reports.
with open(fw_f, 'w', encoding='utf-8') as fw_txt, \
        open(fw_f_onerow, 'w', encoding='utf-8') as fw_txt_onerow:
    for uid in uid_d:
        str_ = uid_d[uid]['html']

        # Keep a copy of the raw HTML on disk under a timestamped,
        # randomised file name.
        fhtml = 'qqzong.vedio.allinone.tmp.html'
        fhtml = '{}{}{}{}'.format('D:\\myv\\myhtml\\', int(time.time()), random.randint(1234, 6789), fhtml)
        with open(fhtml, 'w', encoding='utf-8') as fw:
            fw.write(str_)

        # Parse the markup from memory instead of re-reading the file just written.
        soup = BeautifulSoup(str_, 'html.parser')
        sentence_l = Cut(list(cutlist), list(soup.text))

        # Filter on the number of sentences and on the length of each one.
        if len(sentence_l) < sen_num:
            continue
        if any(len(sen) > max_sen_len for sen in sentence_l):
            continue

        kept = sentence_l[:max_sen_out]

        s = '{}{}{}'.format('-----------------------', uid, '----------------------------------------\n')
        fw_txt.write(s)
        # A file object is not callable; calling it raised
        # "TypeError: '_io.TextIOWrapper' object is not callable".
        fw_txt_onerow.write(s)

        for sen in kept:
            s = '{}{}'.format(sen, '\n')
            print(s)
            fw_txt.write(s)
        fw_txt_onerow.write(''.join(kept))

        # Contact line: fixed prefix followed by a random 11-digit number.
        s = '{}{}{}'.format('联系方式：王经理', random.randint(13200000000, 15812341234), '\n')
        fw_txt.write(s)
        fw_txt_onerow.write(s)

dd = 9  # not referenced anywhere in the visible code
# posted @ 2018-04-17 13:52 papering  views (2068)  comments (0)  bookmark  report
# refresh page / back to top