This article collects typical usage examples of the Java class edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory. If you are wondering what the PTBTokenizerFactory class does, how to use it, or what real-world code that uses it looks like, the curated examples below should help.
PTBTokenizerFactory is a static nested class of edu.stanford.nlp.process.PTBTokenizer. Six code examples using the class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
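Before the examples, here is a minimal, self-contained sketch of the most common pattern: build a factory from an option string, obtain a Tokenizer over a Reader, and collect the tokens. The class name, sample sentence, and option values here are illustrative only.

import java.io.StringReader;
import java.util.List;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory;
import edu.stanford.nlp.process.Tokenizer;

public class PTBQuickStart {
    public static void main(String[] args) {
        // Options are comma-separated key=value pairs understood by PTBTokenizer.
        PTBTokenizerFactory<Word> factory =
                PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory("americanize=false,ptb3Escaping=false");
        Tokenizer<Word> tokenizer =
                factory.getTokenizer(new StringReader("Mr. O'Neill can't attend the 3:30 p.m. meeting."));
        List<Word> tokens = tokenizer.tokenize();
        for (Word token : tokens) {
            System.out.println(token.word());
        }
    }
}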
Example 1: applyPTBTokenizer
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory; // the import this example depends on
private static List<String> applyPTBTokenizer(DocumentPreprocessor dp, boolean tokenizeNLs, boolean ptb3Escaping) {
    PTBTokenizerFactory<Word> tf = PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory(
            "tokenizeNLs=" + tokenizeNLs + ",ptb3Escaping=" + ptb3Escaping + ",asciiQuotes=true");
    dp.setTokenizerFactory(tf);
    List<String> sentences = new ArrayList<>();
    for (List<HasWord> wordList : dp) {
        String sentence = "";
        for (HasWord word : wordList) {
            sentence += " " + splitCompounds(word.word());
        }
        sentences.add(sentence);
    }
    return sentences;
}
Author: infolis | Project: infoLink | Lines: 14 | Source file: TokenizerStanford.java
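A hedged sketch of how such a helper might be invoked from within the same class; the file name and surrounding call are made up, and splitCompounds is the project's own helper rather than part of Stanford NLP:

// Hypothetical caller inside TokenizerStanford (the method above is private static):
DocumentPreprocessor dp = new DocumentPreprocessor("input.txt");
List<String> sentences = applyPTBTokenizer(dp, /* tokenizeNLs */ true, /* ptb3Escaping */ false);
for (String sentence : sentences) {
    System.out.println(sentence.trim());
}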
Example 2: TaggerWrapper
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory; // the import this example depends on
protected TaggerWrapper(MaxentTagger tagger) {
    this.tagger = tagger;
    this.config = tagger.config;

    try {
        tokenizerFactory =
                chooseTokenizerFactory(config.getTokenize(),
                                       config.getTokenizerFactory(),
                                       config.getTokenizerOptions(),
                                       config.getTokenizerInvertible());
    } catch (Exception e) {
        System.err.println("Error in tokenizer factory instantiation for class: " + config.getTokenizerFactory());
        e.printStackTrace();
        tokenizerFactory = PTBTokenizerFactory.newWordTokenizerFactory(config.getTokenizerOptions());
    }

    outputStyle = OutputStyle.fromShortName(config.getOutputFormat());
    outputVerbosity = config.getOutputVerbosity();
    outputLemmas = config.getOutputLemmas();
    morpha = (outputLemmas) ? new Morphology() : null;
    tokenize = config.getTokenize();
    tagSeparator = config.getTagSeparator();
}
Author: benblamey | Project: stanford-nlp | Lines: 24 | Source file: MaxentTagger.java
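The line that touches PTBTokenizerFactory here is the fallback in the catch block: if the configured tokenizer factory cannot be created, tagging still proceeds with a plain PTB word tokenizer. A standalone sketch of the same fallback idiom, with a hypothetical custom factory class name:

import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory;
import edu.stanford.nlp.process.TokenizerFactory;

public class TokenizerFallback {
    @SuppressWarnings("unchecked")
    static TokenizerFactory<Word> loadOrDefault(String factoryClassName, String options) {
        try {
            // Try the user-supplied factory class first (factoryClassName is illustrative).
            return (TokenizerFactory<Word>) Class.forName(factoryClassName).newInstance();
        } catch (Exception e) {
            // Fall back to the standard PTB word tokenizer, as the TaggerWrapper constructor does.
            return PTBTokenizerFactory.newWordTokenizerFactory(options);
        }
    }
}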
Example 3: getWordsFromString
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory; // the import this example depends on
public static List<Word> getWordsFromString(String str) {
    PTBTokenizerFactory<Word> factory = (PTBTokenizerFactory<Word>) PTBTokenizer.factory();
    // Stanford's tokenizer actually changes words to American...altering our original text. Stop it!!
    factory.setOptions("americanize=false");
    Tokenizer<Word> tokenizer = factory.getTokenizer(new BufferedReader(new StringReader(str)));
    return tokenizer.tokenize();
}
Author: nchambers | Project: probschemas | Lines: 8 | Source file: Ling.java
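A hedged usage sketch for the method above; the sample text is made up. Because the factory is configured with americanize=false, a British spelling such as "colour" passes through unchanged:

List<Word> words = Ling.getWordsFromString("The U.K. spelling of \"colour\" is left untouched.");
for (Word w : words) {
    System.out.print(w.word() + " ");
}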
Example 4: chooseTokenizerFactory
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory; // the import this example depends on
protected static TokenizerFactory<? extends HasWord>
        chooseTokenizerFactory(boolean tokenize, String tokenizerFactory,
                               String tokenizerOptions, boolean invertible) {
    if (tokenize && tokenizerFactory.trim().length() != 0) {
        //return (TokenizerFactory<? extends HasWord>) Class.forName(getTokenizerFactory()).newInstance();
        try {
            @SuppressWarnings({"unchecked"})
            Class<TokenizerFactory<? extends HasWord>> clazz =
                    (Class<TokenizerFactory<? extends HasWord>>) Class.forName(tokenizerFactory.trim());
            Method factoryMethod = clazz.getMethod("newTokenizerFactory");
            @SuppressWarnings({"unchecked"})
            TokenizerFactory<? extends HasWord> factory =
                    (TokenizerFactory<? extends HasWord>) factoryMethod.invoke(tokenizerOptions);
            return factory;
        } catch (Exception e) {
            throw new RuntimeException("Could not load tokenizer factory", e);
        }
    } else if (tokenize) {
        if (invertible) {
            if (tokenizerOptions.equals("")) {
                tokenizerOptions = "invertible=true";
            } else if (!tokenizerOptions.matches("(^|.*,)invertible=true")) {
                tokenizerOptions += ",invertible=true";
            }
            return PTBTokenizerFactory.newCoreLabelTokenizerFactory(tokenizerOptions);
        } else {
            return PTBTokenizerFactory.newWordTokenizerFactory(tokenizerOptions);
        }
    } else {
        return WhitespaceTokenizer.factory();
    }
}
Author: benblamey | Project: stanford-nlp | Lines: 31 | Source file: MaxentTagger.java
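For the common configuration (tokenization enabled, no custom factory class) the helper reduces to one of the two PTBTokenizerFactory calls above. A hedged call-site sketch from inside the tagger class, with illustrative option strings:

// Invertible CoreLabel tokens, e.g. when original character offsets must be recoverable:
TokenizerFactory<? extends HasWord> tf =
        chooseTokenizerFactory(true, "", "americanize=false", true);
// Equivalent to PTBTokenizerFactory.newCoreLabelTokenizerFactory("americanize=false,invertible=true")

// Plain Word tokens, no invertibility:
TokenizerFactory<? extends HasWord> wordTf =
        chooseTokenizerFactory(true, "", "", false);
// Equivalent to PTBTokenizerFactory.newWordTokenizerFactory("")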
Example 5: getSentences1_old
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory; // the import this example depends on
public static List<String> getSentences1_old(String text, Set<String> entities) {
    text = text.trim();
    text = StringEscapeUtils.escapeHtml(text);
    text = text.replaceAll("http:.*…\\z", "");
    // Patterns for leading "RT @user" / "MT @user" retweet prefixes.
    String[] toMatch = {"\\ART\\s*@\\S+", "\\AMT\\s*@\\S+"};
    for (String t : toMatch) {
        Pattern pattern = Pattern.compile(t, Pattern.CASE_INSENSITIVE);
        String newTweet = text.trim();
        text = "";
        while (!newTweet.equals(text)) { // each loop will cut off one "RT @XXX" or "#XXX"; may need a few calls to cut all hashtags etc.
            text = newTweet;
            Matcher matcher = pattern.matcher(text);
            newTweet = matcher.replaceAll("");
            newTweet = newTweet.trim();
        }
    }
    text = text.replaceAll("-\\s*\\z", "");
    text = text.replaceAll("…\\z", "");
    text = StringEscapeUtils.unescapeHtml(text);
    text = text.trim();
    String[] parts = text.split(Extractor.urlRegExp);
    List<String> sentences = new ArrayList<String>();
    // for(int i=0;i<parts.length;i++){
    int limit = 10;
    if (limit > parts.length)
        limit = parts.length;
    for (int i = 0; i < limit; i++) {
        // parts[i]=text.replace("http://*…","");
        String text_cleaned = extractor.cleanText(parts[i]);
        // List<String> sentences_tmp=new ArrayList<String>();
        Reader reader = new StringReader(text_cleaned);
        DocumentPreprocessor dp = new DocumentPreprocessor(reader);
        dp.setTokenizerFactory(PTBTokenizerFactory.newWordTokenizerFactory("ptb3Escaping=false,untokenizable=noneDelete"));
        //prop.setProperty("tokenizerOptions", "untokenizable=noneDelete");
        Iterator<List<HasWord>> it = dp.iterator();
        while (it.hasNext()) {
            StringBuilder sentenceSb = new StringBuilder();
            List<HasWord> sentence = it.next();
            boolean last_keep = false;
            for (HasWord token : sentence) {
                if ((!token.word().matches("[,:!.;?)]")) && (!token.word().contains("'")) && !last_keep) {
                    sentenceSb.append(" ");
                }
                last_keep = false;
                if (token.word().matches("[(\\[]"))
                    last_keep = true;
                String next_word = token.toString();
                if ((next_word.toUpperCase().equals(next_word)) && (!next_word.equals("I")) && (!entities.contains(next_word)))
                    next_word = next_word.toLowerCase();
                if (next_word.equals("i")) next_word = "I";
                sentenceSb.append(next_word);
            }
            String new_sentence = sentenceSb.toString().trim();
            Character fc = new_sentence.charAt(0);
            new_sentence = fc.toString().toUpperCase() + new_sentence.substring(1);
            if (new_sentence.endsWith(":"))
                text = text.substring(0, text.length() - 3) + ".";
            sentences.add(new_sentence);
        }
        // sentences.addAll(sentences_tmp);
    }
    return sentences;
}
Author: socialsensor | Project: trends-labeler | Lines: 68 | Source file: TrendsLabeler.java
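Most of the method above is tweet-specific clean-up; the PTBTokenizerFactory part is the DocumentPreprocessor wiring. A trimmed, self-contained sketch of just that wiring, with made-up sample text:

import java.io.Reader;
import java.io.StringReader;
import java.util.List;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory;

public class SentenceSplitDemo {
    public static void main(String[] args) {
        Reader reader = new StringReader("Breaking news: the match is over! Fans celebrate downtown.");
        DocumentPreprocessor dp = new DocumentPreprocessor(reader);
        // Keep raw token forms (no PTB escaping) and silently drop untokenizable characters.
        dp.setTokenizerFactory(PTBTokenizerFactory.newWordTokenizerFactory(
                "ptb3Escaping=false,untokenizable=noneDelete"));
        for (List<HasWord> sentence : dp) {
            StringBuilder sb = new StringBuilder();
            for (HasWord w : sentence) {
                if (sb.length() > 0) {
                    sb.append(' ');
                }
                sb.append(w.word());
            }
            System.out.println(sb);
        }
    }
}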
Example 6: getSentences1
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory; // the import this example depends on
public static List<String> getSentences1(String text, Set<String> entities) {
    // System.out.println(" Text as it is : " + text);
    text = TrendsLabeler.getCleanedTitleMR(text);
    String[] parts = text.split(Extractor.urlRegExp);
    List<String> sentences = new ArrayList<String>();
    // for(int i=0;i<parts.length;i++){
    int limit = 10;
    if (limit > parts.length)
        limit = parts.length;
    for (int i = 0; i < limit; i++) {
        String text_cleaned = extr.cleanText(parts[i]);
        // List<String> sentences_tmp=new ArrayList<String>();
        Reader reader = new StringReader(text_cleaned);
        DocumentPreprocessor dp = new DocumentPreprocessor(reader);
        dp.setTokenizerFactory(PTBTokenizerFactory
                .newWordTokenizerFactory("ptb3Escaping=false,untokenizable=noneDelete"));
        // dp.setTokenizerFactory(PTBTokenizerFactory.newWordTokenizerFactory("untokenizable=noneDelete"));
        Iterator<List<HasWord>> it = dp.iterator();
        while (it.hasNext()) {
            StringBuilder sentenceSb = new StringBuilder();
            List<HasWord> sentence = it.next();
            boolean last_keep = false;
            for (HasWord token : sentence) {
                if ((!token.word().matches("[,:!.;?)]"))
                        && (!token.word().contains("'")) && !last_keep) {
                    sentenceSb.append(" ");
                }
                last_keep = false;
                if (token.word().matches("[(\\[]"))
                    last_keep = true;
                String next_word = token.toString();
                if ((next_word.toUpperCase().equals(next_word))
                        && (!next_word.equals("I"))
                        && (!entities.contains(next_word)))
                    next_word = next_word.toLowerCase();
                if (next_word.equals("i"))
                    next_word = "I";
                sentenceSb.append(next_word);
            }
            String new_sentence = sentenceSb.toString().trim();
            Character fc = new_sentence.charAt(0);
            new_sentence = fc.toString().toUpperCase()
                    + new_sentence.substring(1);
            if (new_sentence.endsWith(":"))
                text = text.substring(0, text.length() - 3) + ".";
            sentences.add(new_sentence);
        }
        // sentences.addAll(sentences_tmp);
    }
    return sentences;
}
Author: socialsensor | Project: trends-labeler | Lines: 57 | Source file: TrendsLabeler.java
Note: the edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory examples in this article were collected from source-code and documentation hosting platforms such as GitHub/MSDocs. The snippets come from open-source projects contributed by their respective authors, and copyright remains with those authors; redistribution and use should follow each project's license. Do not reproduce without permission.