本文整理汇总了Python中pynlpl.statistics.FrequencyList类的典型用法代码示例。如果您正苦于以下问题：Python FrequencyList类的具体用法？Python FrequencyList怎么用？Python FrequencyList使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了FrequencyList类的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_freqlist_tokencount
def test_freqlist_tokencount(self):
    """Frequency List (count tokens)"""
    global sentences
    counts = FrequencyList()
    for tokens in sentences:
        counts.append(tokens)
    # The fixture is expected to add up to 13 counted tokens in total.
    self.assertEqual(counts.total, 13)
开发者ID:493238731,项目名称:pynlpl,代码行数:7,代码来源:statistics.py
示例2: test_freqlist_typecount
def test_freqlist_typecount(self):
    """Frequency List (count types)"""
    global sentences
    counts = FrequencyList()
    for tokens in sentences:
        counts.append(tokens)
    # len() of the list is expected to be 9 for the fixture (distinct entries).
    self.assertEqual(len(counts), 9)
开发者ID:493238731,项目名称:pynlpl,代码行数:7,代码来源:statistics.py
示例3: test_freqlist_caseinsens
def test_freqlist_caseinsens(self):
    """Bigram Frequency List (case insensitive)"""
    global sentences
    # Second constructor argument False requests case-insensitive counting.
    bigram_counts = FrequencyList(None, False)
    for tokens in sentences:
        bigram_counts.append(Windower(tokens, 2))
    # Kept as one short-circuiting expression so the second lookup only
    # happens when the first comparison holds.
    counts_match = (
        bigram_counts[('is', 'a')] == 2
        and bigram_counts[('this', 'is')] == 1
    )
    self.assertTrue(counts_match)
开发者ID:493238731,项目名称:pynlpl,代码行数:7,代码来源:statistics.py
示例4: buildclasser
def buildclasser(file):
    """Build a Classer from the space-separated tokens of a text file.

    Args:
        file: Path of the text file to read. Each line is stripped and split
            on single spaces; the resulting tokens are counted.

    Returns:
        A ``Classer`` constructed from the ``FrequencyList`` of all tokens.
    """
    freqlist = FrequencyList()
    # The context manager closes the handle even if counting raises, which the
    # previous explicit open()/close() pair did not guarantee.
    with open(file, 'r') as f:
        for line in f:
            # Split on a single space (not arbitrary whitespace), as before.
            freqlist.append(line.strip().split(' '))
    return Classer(freqlist)
开发者ID:pombredanne,项目名称:nlpsandbox,代码行数:8,代码来源:wordalign_emibm1.py
示例5: buildfromtext
def buildfromtext(self, files, encoding='utf-8'):
    """Count whitespace-separated tokens in one or more plain-text files.

    ``files`` may be a single path or a list of paths; each file is opened
    with ``encoding``. The accumulated frequency list is handed to
    ``self.buildfromfreqlist``.
    """
    # Accept a bare path as shorthand for a one-element list.
    if isinstance(files, str):
        files = [files]
    counts = FrequencyList()
    for path in files:
        with open(path, 'r', encoding=encoding) as handle:
            for row in handle:
                counts.append(row.strip().split())
    self.buildfromfreqlist(counts)
开发者ID:pombredanne,项目名称:dev_chatbot,代码行数:10,代码来源:textprocessors.py
示例6: process
def process(filename):
    """Build a frequency list of the words or n-grams in one FoLiA document.

    Reads ``settings.n`` (n-gram size), ``settings.sentencemarkers``,
    ``settings.casesensitive``, ``settings.autooutput`` and
    ``settings.extension``.

    Args:
        filename: Path of the FoLiA document to load.

    Returns:
        The ``FrequencyList`` built from the document. When
        ``settings.autooutput`` is set, the list is also saved next to the
        input as ``<name>.freqlist``.
    """
    # sys.stderr.write behaves the same on Python 2 and 3, unlike the
    # Python-2-only ``print >>sys.stderr`` statement.
    sys.stderr.write("Processing " + filename + "\n")
    doc = folia.Document(file=filename)
    freqlist = FrequencyList()

    def ngramtext(ngram):
        # Join the token text of every element of the n-gram. Elements that
        # have no toktext() (presumably begin/end marker strings inserted by
        # Windower -- TODO confirm) are used as they are.
        return ' '.join(
            [x.toktext() if hasattr(x, 'toktext') else x for x in ngram]
        )

    def count(text):
        # NOTE(review): lower-casing when ``casesensitive`` is true looks
        # inverted; the original condition is kept -- confirm against the
        # definition of ``settings`` before changing it.
        if settings.casesensitive:
            text = text.lower()
        freqlist.count(text)

    if settings.n == 1:
        for word in doc.words():
            count(word.toktext())
    else:
        # Both n-gram modes window sentence by sentence. The old "no sentence
        # markers" branch referenced ``sentence`` and ``ngram`` without ever
        # binding them, so it always raised NameError.
        for sentence in doc.sentences():
            if settings.sentencemarkers:
                ngrams = Windower(sentence.words(), settings.n)
            else:
                ngrams = Windower(sentence.words(), settings.n, None, None)
            for ngram in ngrams:
                count(ngramtext(ngram))

    if settings.autooutput:
        suffix = '.' + settings.extension
        if filename.lower().endswith(suffix):
            outfilename = filename[:-len(suffix)] + '.freqlist'
        else:
            # Was ``outfilename += ...`` on a name that had not been assigned.
            outfilename = filename + '.freqlist'
        freqlist.save(outfilename, True)
    return freqlist
开发者ID:larsmans,项目名称:folia,代码行数:31,代码来源:foliafreqlist.py
示例7: buildfromfolia
def buildfromfolia(self, files, encoding='utf-8'):
    """Count the tokens of every sentence in one or more FoLiA documents.

    ``files`` may be a single path or a list of paths. The ``encoding``
    argument is accepted but not used by this method. The accumulated
    frequency list is handed to ``self.buildfromfreqlist``.
    """
    # Accept a bare path as shorthand for a one-element list.
    if isinstance(files, str):
        files = [files]
    counts = FrequencyList()
    for path in files:
        document = folia.Document(file=path)
        for sentence in document.sentences():
            # Tokens are obtained by splitting the sentence text on single spaces.
            counts.append(sentence.toktext().split(' '))
    self.buildfromfreqlist(counts)
开发者ID:pombredanne,项目名称:dev_chatbot,代码行数:11,代码来源:textprocessors.py
示例8: main
def main():
    """Command-line entry point: build one frequency list from files/directories.

    Options handled: -h/--help, -e ENCODING, -E EXTENSION, -o OUTPUTFILE,
    -O (per-file automatic output), -s (sentence markers), -r (recurse),
    -q (ignore errors). Remaining arguments are input files or directories.

    Exits with status 2 on a usage error or when no inputs are given, and
    with status 3 when an input path does not exist.

    Raises:
        Exception: for an option that getopt accepts but that has no handler.
    """
    try:
        # 'e:' was missing from the option string, so the -e handler below
        # could never be reached (getopt rejected the flag first).
        opts, args = getopt.getopt(sys.argv[1:], "o:OE:e:htspwrq", ["help"])
    except getopt.GetoptError as err:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    outputfile = None
    for o, a in opts:
        if o == '-h' or o == '--help':
            usage()
            sys.exit(0)
        elif o == '-e':
            settings.encoding = a
        elif o == '-E':
            settings.extension = a
        elif o == '-o':
            outputfile = a
        elif o == '-O':
            settings.autooutput = True
        elif o == '-s':
            settings.sentencemarkers = True
        elif o == '-r':
            settings.recurse = True
        elif o == '-q':
            settings.ignoreerrors = True
        else:
            raise Exception("No such option: " + o)

    # Check the positional arguments left over by getopt, not the raw argv.
    if not args:
        print("ERROR: No files specified", file=sys.stderr)
        sys.exit(2)

    if outputfile:
        outputfile = io.open(outputfile, 'w', encoding=settings.encoding)
    try:
        freqlist = FrequencyList()
        # Iterating sys.argv[1:] here treated option flags (e.g. "-o") as
        # input paths and aborted with "File or directory not found".
        for x in args:
            if os.path.isdir(x):
                processdir(x, freqlist)
            elif os.path.isfile(x):
                freqlist += process(x)
            else:
                print("ERROR: File or directory not found: " + x, file=sys.stderr)
                sys.exit(3)
        if outputfile:
            freqlist.save(outputfile, True)
        else:
            for line in freqlist.output("\t", True):
                print(line)
    finally:
        # Close the handle opened above, including on sys.exit()/errors.
        if outputfile:
            outputfile.close()
开发者ID:Sandy4321,项目名称:folia,代码行数:53,代码来源:foliafreqlist.py
示例9: load
def load(self, filename):
    """Load model settings and n-gram counts from a SimpleLanguageModel file.

    The file is read as UTF-8 and blank lines are ignored. Expected layout:

    * first non-blank line: ``[simplelanguagemodel]``
    * header lines ``n=``, ``beginmarker=``, ``endmarker=``, ``sentences=``
      and ``casesensitive=`` (the latter re-creates both frequency lists)
    * ``[freqlistN]`` followed by ``ngram<TAB>count`` lines
    * ``[freqlistNm1]`` followed by ``ngram<TAB>count`` lines

    Count lines that cannot be parsed are reported as a warning and skipped.

    Args:
        filename: Path of the model file.

    Raises:
        Exception: if the first line is not the expected marker, or a header
            line is not recognised.
    """
    self.freqlistN = FrequencyList(None, self.casesensitive)
    self.freqlistNm1 = FrequencyList(None, self.casesensitive)

    def countline(freqlist, line):
        # Best-effort parse of one "ngram<TAB>count" line. ``except Exception``
        # keeps the skip-and-warn behaviour but no longer swallows
        # KeyboardInterrupt/SystemExit the way a bare ``except:`` did.
        try:
            ngram, count = line.split("\t")
            freqlist.count(ngram.split(' '), int(count))
        except Exception:
            print("Warning, could not parse line whilst loading frequency list: ", line,file=stderr)

    # mode: False = marker not seen yet, 1 = header, 2 = freqlistN section,
    # 3 = freqlistNm1 section.
    mode = False
    # The context manager closes the file; it was previously left open.
    with io.open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if not mode:
                if line != "[simplelanguagemodel]":
                    raise Exception("File is not a SimpleLanguageModel")
                mode = 1
            elif mode == 1:
                if line[:2] == 'n=':
                    self.n = int(line[2:])
                elif line[:12] == 'beginmarker=':
                    self.beginmarker = line[12:]
                elif line[:10] == 'endmarker=':
                    self.endmarker = line[10:]
                elif line[:10] == 'sentences=':
                    self.sentences = int(line[10:])
                elif line[:14] == 'casesensitive=':
                    self.casesensitive = bool(int(line[14:]))
                    # Rebuild both lists so they honour the loaded setting.
                    self.freqlistN = FrequencyList(None, self.casesensitive)
                    self.freqlistNm1 = FrequencyList(None, self.casesensitive)
                elif line == "[freqlistN]":
                    mode = 2
                else:
                    raise Exception("Syntax error in language model file: ", line)
            elif mode == 2:
                if line == "[freqlistNm1]":
                    mode = 3
                else:
                    countline(self.freqlistN, line)
            elif mode == 3:
                countline(self.freqlistNm1, line)

    # Tuples, matching what __init__ builds (this method used to build lists).
    if self.beginmarker:
        self._begingram = tuple([self.beginmarker] * (self.n-1))
    if self.endmarker:
        self._endgram = tuple([self.endmarker] * (self.n-1))
开发者ID:amitbeka,项目名称:pynlpl,代码行数:50,代码来源:lm.py
示例10: __init__
def __init__(self, n=2, casesensitive = True, beginmarker = "<begin>", endmarker = "<end>"):
    """Set up an empty n-gram model.

    Creates two empty frequency lists (``freqlistN`` and ``freqlistNm1``)
    with the requested case sensitivity, stores ``n`` and the markers, and
    resets the sentence counter. ``n`` must be an int of at least 2.
    """
    self.casesensitive = casesensitive
    self.freqlistN = FrequencyList(None, self.casesensitive)
    self.freqlistNm1 = FrequencyList(None, self.casesensitive)
    assert isinstance(n,int) and n >= 2
    self.n = n
    self.sentences = 0
    self.beginmarker = beginmarker
    self.endmarker = endmarker
    # Padding grams repeat the marker n-1 times; they are only created when
    # the corresponding marker is truthy.
    padding = n - 1
    if self.beginmarker:
        self._begingram = (self.beginmarker,) * padding
    if self.endmarker:
        self._endgram = (self.endmarker,) * padding
开发者ID:amitbeka,项目名称:pynlpl,代码行数:15,代码来源:lm.py
示例11: buildclasser
def buildclasser():
    """Count unigrams of the corpus file, build a Classer and save it.

    Reads the module-level ``corpusfile``, ``DOTOKENIZE`` and ``outputprefix``.
    Every line is wrapped in ``<begin>``/``<end>`` tokens before counting.
    Progress is logged every 10000 lines.

    Returns:
        The ``Classer`` built from the counts; it is also saved to
        ``outputprefix + '.cls'``.
    """
    global DOTOKENIZE, ENCODING, outputprefix
    log("Counting unigrams (for classer) ...",stream=sys.stderr)
    freqlist = FrequencyList()
    # The context manager closes the corpus even if tokenising or counting
    # raises; the explicit close() was skipped in that case.
    with open(corpusfile) as f:
        for i, line in enumerate(f):
            if (i % 10000 == 0):
                log("\tLine " + str(i+1) + " - (classer construction)", stream=sys.stderr)
            if DOTOKENIZE:
                # NOTE(review): if crude_tokenizer returns a token list rather
                # than a string, the .strip() below raises AttributeError --
                # confirm its return type.
                line = crude_tokenizer(line.strip())
            line = line.strip().split(' ')
            freqlist.append(['<begin>'] + line + ['<end>'])
    log("Building classer ...", stream=sys.stderr)
    classer = Classer(freqlist)
    classer.save(outputprefix + '.cls')
    log("\t" + str(len(classer)) + " classes found", stream=sys.stderr)
    return classer
开发者ID:pombredanne,项目名称:nlpsandbox,代码行数:19,代码来源:extractphraselist2.py
示例12: main
def main():
    """Command-line entry point: print an n-gram frequency list for text files.

    Options: -h/--help, -n N (n-gram size, default 1), -i (case insensitive),
    -e ENCODING (default utf-8). Remaining arguments are input files.

    One line per type is written to stdout (text, count, probability and
    information, tab-separated); summary statistics go to stderr.
    Exits with status 2 on a getopt error and 1 when no files are given or an
    option has no handler.
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err),file=sys.stderr)
        usage()
        sys.exit(2)

    casesensitive = True
    encoding = 'utf-8'
    n = 1

    for o, a in opts:
        if o in ("-h", "--help"):
            # getopt accepts these flags; they used to be reported as unknown.
            usage()
            sys.exit(0)
        elif o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        else:
            print("ERROR: Unknown option:",o,file=sys.stderr)
            sys.exit(1)

    if not files:
        # Was a Python 2 ``print >>sys.stderr`` statement, which raises
        # TypeError under the print() function used everywhere else here.
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # The context manager closes the file even when tokenising raises.
        with codecs.open(filename,'r',encoding) as f:
            for line in f:
                if n > 1:
                    freqlist.append(Windower(crude_tokenizer(line),n))
                else:
                    freqlist.append(crude_tokenizer(line))

    dist = Distribution(freqlist)
    for key, count in freqlist:
        # Look the distribution up with the original key; only the displayed
        # text is space-joined. Joining first meant n-gram keys were looked
        # up as a string that is not a key of the frequency list.
        if isinstance(key, (tuple, list)):
            text = " ".join(key)
        else:
            text = key
        s = text + "\t" + str(count) + "\t" + str(dist[key]) + "\t" + str(dist.information(key))
        print(s)

    print("Tokens:           ", freqlist.tokens(),file=sys.stderr)
    print("Types:            ", len(freqlist),file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(),file=sys.stderr)
    print("Entropy:          ", dist.entropy(),file=sys.stderr)
开发者ID:Acidburn0zzz,项目名称:pynlpl,代码行数:51,代码来源:freqlist.py
示例13: process
def process(filename):
    """Build a frequency list of the words or n-grams in one FoLiA document.

    Reads ``settings.n`` (n-gram size), ``settings.sentencemarkers``,
    ``settings.casesensitive``, ``settings.autooutput``,
    ``settings.extension`` and ``settings.ignoreerrors``.

    Args:
        filename: Path of the FoLiA document to load.

    Returns:
        The ``FrequencyList`` built from the document. If an error occurs and
        ``settings.ignoreerrors`` is set, the error is reported on stderr and
        whatever was counted so far (possibly nothing) is returned.

    Raises:
        Exception: any processing error, unless ``settings.ignoreerrors`` is set.
    """
    # Bound before the try block so the final return cannot hit an unbound
    # local when loading the document fails and errors are ignored.
    freqlist = FrequencyList()

    def ngramtext(ngram):
        # Join the token text of every element of the n-gram. Elements that
        # have no toktext() (presumably begin/end marker strings inserted by
        # Windower -- TODO confirm) are used as they are.
        return ' '.join(
            [x.toktext() if hasattr(x, 'toktext') else x for x in ngram]
        )

    def count(text):
        # NOTE(review): lower-casing when ``casesensitive`` is true looks
        # inverted; the original condition is kept -- confirm against the
        # definition of ``settings`` before changing it.
        if settings.casesensitive:
            text = text.lower()
        freqlist.count(text)

    try:
        print("Processing " + filename,file=sys.stderr)
        doc = folia.Document(file=filename)

        if settings.n == 1:
            for word in doc.words():
                count(word.toktext())
        else:
            # Both n-gram modes window sentence by sentence. The old "no
            # sentence markers" branch referenced ``sentence`` and ``ngram``
            # without ever binding them, so it always raised NameError.
            for sentence in doc.sentences():
                if settings.sentencemarkers:
                    ngrams = Windower(sentence.words(), settings.n)
                else:
                    ngrams = Windower(sentence.words(), settings.n, None, None)
                for ngram in ngrams:
                    count(ngramtext(ngram))

        if settings.autooutput:
            suffix = '.' + settings.extension
            if filename.lower().endswith(suffix):
                outfilename = filename[:-len(suffix)] + '.freqlist'
            else:
                # Was ``outfilename += ...`` on an unassigned name.
                outfilename = filename + '.freqlist'
            freqlist.save(outfilename,True)
    except Exception as e:
        if settings.ignoreerrors:
            print("ERROR: An exception was raised whilst processing " + filename, e,file=sys.stderr)
        else:
            raise
    return freqlist
开发者ID:Sandy4321,项目名称:folia,代码行数:37,代码来源:foliafreqlist.py
示例14: main
def main():
    """Command-line entry point: print an n-gram frequency list for text files.

    Arguments: -n/--ngramsize (default 1), -i/--caseinsensitive,
    -e/--encoding (default utf-8) and one or more input files.

    One line per type is written to stdout (text, count, probability and
    information, tab-separated); summary statistics go to stderr.
    """
    parser = argparse.ArgumentParser(description="Generate an n-gram frequency list", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n','--ngramsize', help="N-gram size", type=int, action='store',default=1)
    parser.add_argument('-i','--caseinsensitive', help="Case insensitive", action="store_true")
    parser.add_argument('-e','--encoding', help="Character encoding", type=str, action='store',default='utf-8')
    # The previous help text described a sampling tool, not this one.
    parser.add_argument('files', type=str, nargs='+', help="The text files to count")
    # nargs='+' makes argparse itself reject an empty file list, so no
    # separate "No files specified" check is needed.
    args = parser.parse_args()

    # FrequencyList's second argument is *casesensitive*; passing the
    # --caseinsensitive flag straight through inverted the option.
    freqlist = FrequencyList(None, not args.caseinsensitive)
    for filename in args.files:
        # The context manager closes the file even when tokenising raises.
        with io.open(filename,'r',encoding=args.encoding) as f:
            for line in f:
                if args.ngramsize > 1:
                    freqlist.append(Windower(crude_tokenizer(line),args.ngramsize))
                else:
                    freqlist.append(crude_tokenizer(line))

    dist = Distribution(freqlist)
    for key, count in freqlist:
        # Look the distribution up with the original key; only the displayed
        # text is space-joined. Joining first meant n-gram keys were looked
        # up as a string that is not a key of the frequency list.
        if isinstance(key, (tuple, list)):
            text = " ".join(key)
        else:
            text = key
        s = text + "\t" + str(count) + "\t" + str(dist[key]) + "\t" + str(dist.information(key))
        print(s)

    print("Tokens:           ", freqlist.tokens(),file=sys.stderr)
    print("Types:            ", len(freqlist),file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(),file=sys.stderr)
    print("Entropy:          ", dist.entropy(),file=sys.stderr)
开发者ID:493238731,项目名称:pynlpl,代码行数:36,代码来源:freqlist.py
示例15: open
#!/usr/bin/env python
#-*- coding:utf-8 -*-
from pynlpl.textprocessors import Classer
from pynlpl.statistics import FrequencyList
import sys
# Script body: count the tokens of the file named on the command line, build
# a Classer from the counts, save it as <file>.cls and write the class-encoded
# corpus to <file>.clsenc.
#
# Progress messages use sys.stderr.write so the script emits the same output
# on Python 2 and Python 3 (``print >>sys.stderr`` is Python-2-only).
filename = sys.argv[1]

sys.stderr.write("Counting tokens\n")
freqlist = FrequencyList()
# The context manager closes the corpus even if counting raises.
with open(filename) as f:
    for i, line in enumerate(f):
        if (i % 10000 == 0):
            sys.stderr.write("\tLine " + str(i+1) + "\n")
        # Wrap every line in sentence boundary tokens before counting.
        line = ['<s>'] + line.strip().split(' ') + ['</s>']
        freqlist.append(line)

sys.stderr.write("Building classer\n")
classer = Classer(freqlist, filesupport=True )
classer.save(filename + '.cls')

sys.stderr.write("Encoding data\n")
classer.encodefile(filename, filename + '.clsenc')
开发者ID:antiface,项目名称:pynlpl,代码行数:26,代码来源:classencode.py
示例16: open
#!/usr/bin/env python3
import sys
from pynlpl.statistics import FrequencyList
# Script body: for every file named on the command line, read
# "lemmas<TAB>count" lines, add the count to each space-separated lemma and
# save the result as <file>.freqlist.
for filename in sys.argv[1:]:
    freqlist = FrequencyList()
    # The context manager closes the input even if a line fails to parse.
    with open(filename,'rt',encoding='utf-8') as f_in:
        for line in f_in:
            line = line.strip()
            if not line:
                # A blank line has no count column and used to crash on fields[1].
                continue
            fields = line.split('\t')
            count = int(fields[1])
            for lemma in fields[0].split(' '):
                freqlist.count(lemma, count)
    freqlist.save(filename + '.freqlist')
开发者ID:pombredanne,项目名称:nlpsandbox,代码行数:15,代码来源:searchdidier_freqlist.py
示例17: int
for o, a in opts:
if o == "-n":
n = int(a)
elif o == "-i":
casesensitive = False
elif o == "-e":
encoding = a
else:
print >>sys.stderr, "ERROR: Unknown option:",o
sys.exit(1)
if not files:
print >>sys.stderr, "No files specified"
sys.exit(1)
freqlist = FrequencyList(None, casesensitive)
for filename in files:
f = codecs.open(filename,'r',encoding)
for line in f:
if n > 1:
freqlist.append(Windower(crude_tokenizer(line),n))
else:
freqlist.append(crude_tokenizer(line))
f.close()
dist = Distribution(freqlist)
for type, count in freqlist:
if isinstance(type,tuple) or isinstance(type,list):
type = " ".join(type)
s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type))
开发者ID:antiface,项目名称:pynlpl,代码行数:31,代码来源:freqlist.py
示例18: FrequencyList
#!/usr/bin/env python
#-*- coding:utf-8 -*-
from pynlpl.statistics import FrequencyList
from pynlpl.textprocessors import crude_tokenizer, Classer
import sys
import codecs
import asizeof
freqlist = FrequencyList()
f = codecs.open(sys.argv[1], 'r','utf-8')
for line in f:
line = crude_tokenizer(line.strip())
freqlist.append(line)
f.close()
print "FREQLIST: " ,asizeof.asizeof(freqlist)
classer = Classer(freqlist)
print "CLASSER: " ,asizeof.asizeof(classer)
classer2 = Classer(freqlist, False,True)
print "CLASSER (ONLY DECODER): " ,asizeof.asizeof(classer2)
freqlist2 = FrequencyList()
f = codecs.open(sys.argv[1], 'r','utf-8')
for line in f:
开发者ID:pombredanne,项目名称:nlpsandbox,代码行数:31,代码来源:classertest.py
示例19: Exception
elif o == '-o':
outputfile = a
elif o == '-O':
settings.autooutput = True
elif o == '-s':
settings.sentencemarkers = True
elif o == '-r':
settings.recurse = True
else:
raise Exception("No such option: " + o)
if outputfile: outputfile = codecs.open(outputfile,'w',settings.encoding)
if len(sys.argv) >= 2:
freqlist = FrequencyList()
for x in sys.argv[1:]:
if os.path.isdir(x):
processdir(x,freqlist)
elif os.path.isfile(x):
freqlist += process(x)
else:
print >>sys.stderr, "ERROR: File or directory not found: " + x
sys.exit(3)
if outputfile:
freqlist.save(outputfile, True)
else:
for line in freqlist.output("\t", True):
print line
else:
print >>sys.stderr,"ERROR: No files specified"
开发者ID:larsmans,项目名称:folia,代码行数:31,代码来源:foliafreqlist.py
示例20: FrequencyList
#!/usr/bin/env python
#-*- coding:utf-8 -*-
from pynlpl.textprocessors import Windower, crude_tokenizer
from pynlpl.statistics import FrequencyList, Distribution
import sys
import codecs
# Script body: count the bigrams of the UTF-8 text file named on the command
# line, print the type/token ratio and then the bigram distribution.
#
# Output is written through sys.stdout instead of Python-2-only print
# statements, so the script runs unchanged on Python 2 and Python 3.
with codecs.open(sys.argv[1],'r','utf-8') as infile:
    freqlist = FrequencyList()
    for line in infile:
        freqlist.append(Windower(crude_tokenizer(line),2))

# Same text as the old ``print "Type/Token Ratio: ", value`` statement, which
# put a single space between its two items.
sys.stdout.write("%s %s\n" % ("Type/Token Ratio: ", freqlist.typetokenratio()))
sys.stdout.flush()

# Byte sink for the UTF-8 encoded lines: sys.stdout.buffer on Python 3,
# sys.stdout itself on Python 2 (where it has no ``buffer`` attribute).
rawout = getattr(sys.stdout, 'buffer', sys.stdout)

### uncomment if you want to output the full frequency list:
#for line in freqlist.output():
#    rawout.write(line.encode('utf-8') + b"\n")

dist = Distribution(freqlist)
for line in dist.output():
    rawout.write(line.encode('utf-8') + b"\n")
开发者ID:493238731,项目名称:pynlpl,代码行数:24,代码来源:freqlist.py
注：本文中的pynlpl.statistics.FrequencyList类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论