(Code) Python implementation of noun-phrase extraction from sentences
"""Extract noun phrases from per-video language annotations.

For every video directory under ``path``, read ``language.txt``, chunk its
first line into noun phrases with TextBlob, and write two files back into
the directory: the raw phrases and, per phrase, a comma-separated line of
vocabulary indices padded to ``max_phrase_length`` entries.
"""
import os
import string
import pdb

import numpy as np
import pandas as pd
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from textblob import TextBlob

# Number of comma-separated word indices each phrase-index line is padded to.
max_phrase_length = 5

basicPath = '/media/wangxiao/b8efbc67-7ea5-476d-9631-70da75f84e2d/train_dataset/'
path = basicPath


def load_word_list(word_base_path):
    """Read the vocabulary file (one word per line), stripping the trailing
    newline and any trailing period from each entry."""
    with open(word_base_path, 'r') as word_base:
        return [line.rstrip('\n').rstrip('.') for line in word_base]


def process_video(video_dir, word_to_idx):
    """Extract noun phrases from ``<video_dir>/language.txt`` and write
    ``auto_extracted_Phrase.txt`` (one phrase per line) and
    ``autoExtracted_Phrase_Index.txt`` (padded vocabulary-index lines).

    Raises KeyError if a phrase word is missing from the vocabulary (the
    original ``list.index`` raised ValueError in the same situation).
    """
    lang_path = os.path.join(video_dir, 'language.txt')
    # For other datasets the annotation file is named <videoName>.txt instead:
    # lang_path = os.path.join(video_dir, video_name + '.txt')
    with open(lang_path, 'r') as f:
        language = f.readline()

    # POS tags are computed for reference/debugging only; TextBlob performs
    # the actual noun-phrase chunking.
    words = word_tokenize(language)
    token_results = nltk.pos_tag(words)
    blob = TextBlob(language)
    print(blob.noun_phrases)

    phrase_path = os.path.join(video_dir, 'auto_extracted_Phrase.txt')
    index_path = os.path.join(video_dir, 'autoExtracted_Phrase_Index.txt')
    with open(phrase_path, 'w') as f_phrase, open(index_path, 'w') as f_idx:
        for phrase in blob.noun_phrases:
            f_phrase.write(phrase)
            f_phrase.write('\n')

            written_num = 0
            # NOTE(review): this tests the CHARACTER length of the phrase, not
            # its word count — a 1-character phrase yields a padding-only line.
            # Preserved as-is; confirm whether word count was intended.
            if len(phrase) > 1:
                for word in word_tokenize(phrase):
                    f_idx.write(str(word_to_idx[word]))
                    f_idx.write(',')
                    written_num += 1

            # Pad with zeros up to max_phrase_length entries.
            # NOTE(review): pad value '0' collides with the real index of the
            # first vocabulary word, and phrases longer than max_phrase_length
            # are NOT truncated — both behaviors kept from the original.
            for _ in range(max_phrase_length - written_num):
                f_idx.write('0')
                f_idx.write(',')
            f_idx.write('\n')


def main():
    print(path)

    word_base_path = os.path.join(basicPath, 'word_list.txt')
    word_list = load_word_list(word_base_path)
    # Dict lookup is O(1) per word vs. O(n) list.index in the inner loop;
    # setdefault keeps the FIRST occurrence of a duplicate word, matching
    # list.index semantics.
    word_to_idx = {}
    for idx, word in enumerate(word_list):
        word_to_idx.setdefault(word, idx)

    for video_name in os.listdir(path):
        print(video_name)
        process_video(os.path.join(path, video_name), word_to_idx)


if __name__ == '__main__':
    main()
Stay Hungry, Stay Foolish ...