【parser】stanford-parser demo使用
2013-05-08 19:41 Loull 阅读(1485) 评论(0) 编辑 收藏 举报测试站点:
http://nlp.stanford.edu:8080/parser/index.jsp
先贴点代码,是stanford-parser的demo:
import java.util.Collection;
import java.util.List;
import java.io.StringReader;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
/**
 * Demonstrates the basic ways of invoking the Stanford lexicalized parser:
 * parsing a file via {@link edu.stanford.nlp.process.DocumentPreprocessor},
 * parsing pre-tokenized text, and tokenizing raw text explicitly before
 * parsing. Requires the Stanford parser models jar on the classpath.
 */
class ParserDemo {

  /**
   * Loads the English PCFG model from the classpath (it also accepts a
   * filesystem path) and dispatches: with an argument, parse that file;
   * otherwise run the in-memory API demo.
   *
   * @param args optional; args[0] is a path to a text file to parse
   */
  public static void main(String[] args) {
    LexicalizedParser lp =
        LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    if (args.length > 0) {
      demoDP(lp, args[0]);
    } else {
      demoAPI(lp);
    }
  }

  /**
   * Sentence-segments and tokenizes a file with DocumentPreprocessor, then
   * prints each sentence's parse tree (Penn Treebank format) and its
   * CC-processed typed dependencies to stdout.
   *
   * @param lp       a loaded parser
   * @param filename path of the plain-text file to parse
   */
  public static void demoDP(LexicalizedParser lp, String filename) {
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    // DocumentPreprocessor handles both sentence splitting and tokenization;
    // an explicit tokenizer (as in demoAPI) could be passed to it instead.
    for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
      Tree parse = lp.apply(sentence);
      parse.pennPrint(); // Penn Treebank bracketed format
      System.out.println();
      GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
      // Parameterized type (original used a raw Collection, causing an
      // unchecked warning); consistent with demoAPI below.
      Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
      System.out.println(tdl);
      System.out.println();
    }
  }

  /**
   * Shows two in-memory entry points: parsing an already-tokenized word list,
   * and tokenizing a raw string with an explicit PTBTokenizer first. Output
   * (parse tree plus collapsed typed dependencies) goes to stdout; a
   * PrintWriter could be passed to TreePrint.printTree to capture it instead.
   *
   * @param lp a loaded parser
   */
  public static void demoAPI(LexicalizedParser lp) {
    // Option 1: parse a list of correctly tokenized words directly.
    String[] sent = { "This", "is", "an", "easy", "sentence", "." };
    List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
    Tree parse = lp.apply(rawWords);
    parse.pennPrint();
    System.out.println();

    // Option 2: tokenize raw text with an explicit PTB tokenizer, then parse.
    String sent2 = "This is another sentence.";
    TokenizerFactory<CoreLabel> tokenizerFactory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> rawWords2 =
        tokenizerFactory.getTokenizer(new StringReader(sent2)).tokenize();
    parse = lp.apply(rawWords2);

    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);
    System.out.println();

    // The format string controls what TreePrint emits for each tree.
    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
    tp.printTree(parse);
  }

  private ParserDemo() {} // static methods only
}
结果:
Your query
猴子喜欢吃香蕉。
Segmentation
猴子
喜欢
吃
香蕉
。
Tagging
猴子/NR
喜欢/VV
吃/VV
香蕉/NN
。/PU
Parse
(ROOT
(IP
(NP (NR 猴子))
(VP (VV 喜欢)
(IP
(VP (VV 吃)
(NP (NN 香蕉)))))
(PU 。)))
Typed dependencies
nsubj(喜欢-2, 猴子-1)
root(ROOT-0, 喜欢-2)
ccomp(喜欢-2, 吃-3)
dobj(吃-3, 香蕉-4)
Typed dependencies, collapsed
nsubj(喜欢-2, 猴子-1)
root(ROOT-0, 喜欢-2)
ccomp(喜欢-2, 吃-3)
dobj(吃-3, 香蕉-4)