本文整理汇总了Python中pyth.plugins.plaintext.writer.PlaintextWriter类的典型用法代码示例。如果您正苦于以下问题:Python PlaintextWriter类的具体用法?Python PlaintextWriter怎么用?Python PlaintextWriter使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了PlaintextWriter类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: read_rtf_text
def read_rtf_text(fp, errors='strict', encoding='utf-8'):
    """Read an RTF stream and return its text-like content as decoded text.

    :param fp: file-like object handed to ``CustomRtf15Reader.read``.
    :param errors: forwarded unchanged to ``CustomRtf15Reader.read``.
    :param encoding: codec used to decode the plaintext writer's output.
    :return: the rendered plaintext, decoded with *encoding*.
    """
    doc = CustomRtf15Reader.read(fp, errors=errors)
    for paragraph in doc.content:
        # Build a real list: on Python 3 ``filter`` returns a one-shot
        # iterator, which would leave ``content`` exhausted after one pass.
        paragraph.content = [
            item for item in paragraph.content
            if paragraph_is_text_like(item)
        ]
    return PlaintextWriter.write(doc).read().decode(encoding)
开发者ID:labhackercd,项目名称:novo-retorica,代码行数:7,代码来源:utils.py
示例2: convert_to_txt
def convert_to_txt(file_path):
    """Return the text content of the file at *file_path*.

    The loader is picked from the value returned by ``_get_extension``:
    ``.txt`` goes through ``open_with_correct_encoding``, ``.docx`` through
    ``_docx_to_txt``, ``.rtf`` through pyth, and anything else is read as
    ``ENCODING_UTF_8`` text.

    :param file_path: path of the file to convert.
    :return: the extracted text; ``""`` when a ``.txt`` file cannot be read.
    :raises OSError: from ``os.stat`` when *file_path* does not exist.
    """
    logger.debug("convert_to_txt: %s" % file_path)
    words = None
    if not os.path.exists(file_path):
        # Only logged here; the os.stat() call below still raises for a
        # missing path, so callers see the same exception as before.
        logger.error("missing file %s", file_path)
    file_size = os.stat(file_path).st_size
    logger.debug("convert_to_txt: %d bytes at %s", file_size, file_path)
    ext = _get_extension(file_path)
    if ext == '.txt':
        logger.debug("loading txt file")
        try:
            # NOTE(review): the returned file handle is not closed here;
            # confirm whether open_with_correct_encoding closes it itself.
            _encoding, _file_handle, words = open_with_correct_encoding(file_path)
        except Exception:
            logger.error("Wasn't able to read the words from the file %s" % file_path)
            words = ""
    elif ext == '.docx':
        logger.debug("loading docx file")
        words = _docx_to_txt(file_path)
    elif ext == '.rtf':
        logger.debug("loading rtf file")
        # Close the handle as soon as the document has been parsed.
        with open(file_path) as rtf_file:
            doc = Rtf15Reader.read(rtf_file)
        words = PlaintextWriter.write(doc).getvalue()
    else:
        # Use the module logger like every other message in this function.
        logger.warning("Couldn't find an extension on the file, so assuming text")
        with codecs.open(file_path, 'r', ENCODING_UTF_8) as myfile:
            words = myfile.read()
    logger.debug("loaded %d chars" % len(words))
    return words
开发者ID:c4fcm,项目名称:DataBasic,代码行数:29,代码来源:filehandler.py
示例3: GetExternal
def GetExternal(version, odl_data, source, class_id):
    """Return the plain text of the ``_Art1_RTF`` attribute of *version*.

    ``version[2]`` is scanned for items whose first two fields are
    ``"Attribute"`` and ``"_Art1_RTF"``.  A two-element value list names an
    external file (a member of *source* when it is a ``ZipFile``, otherwise
    a file below the directory *source*); a one-element value list carries
    the RTF inline.  The RTF is converted to plain text and passed through
    ``ReplaceTextNames``.

    :param version: record whose third element is the list of items.
    :param odl_data: forwarded unchanged to ``ReplaceTextNames``.
    :param source: a ``ZipFile`` or a directory path.
    :param class_id: forwarded unchanged to ``ReplaceTextNames``.
    :return: ``""`` as soon as an attribute with empty RTF data is met,
        otherwise the result of ``ReplaceTextNames``.

    NOTE(review): the nesting was reconstructed from a paste that had lost
    its indentation; the form-feed strip is assumed to apply to both kinds
    of external file, and the final return is assumed to follow the loop.
    """
    external = ""
    for item in version[2]:
        if item[0] != "Attribute" or item[1] != "_Art1_RTF":
            continue
        values = item[2]
        if len(values) == 2:
            if isinstance(source, ZipFile):
                # ZipFile.read opens, reads and closes the member itself.
                data = source.read(values[0])
            else:
                with open(join(source, values[0]), 'rb') as rtf_file:
                    data = rtf_file.read()
            data = data.replace("\x0c", "")
        elif len(values) == 1:
            data = values[0]
        else:
            # Neither an external reference nor an inline value: there is
            # no RTF to convert (previously this hit an unbound or stale
            # ``data``).
            continue
        if data == "":
            return ""
        buf = StringIO()
        buf.write(data)
        doc = Rtf15Reader.read(buf, clean_paragraphs = False)
        external = PlaintextWriter.write(doc).getvalue()
        external = external.replace("\n\n", "\n")
    return ReplaceTextNames(external, version, odl_data, class_id)
开发者ID:jeroenk,项目名称:artisanConvert,代码行数:29,代码来源:odl_extract.py
示例4: read_recommendations
def read_recommendations(self, file_name):
    """
    Read the targeted values from an RTF file such as
    "WHO Daily Recommended Values.rtf".

    The document's plain text is split on blank lines.  In each entry the
    text before the first "(" of the first comma-separated field is used as
    the nutrient name and the second comma-separated field as its value.
    The resulting mapping is also stored on ``self.target_values``.

    :param file_name: path of the RTF file to read
    :return: tuple ``(target, filtered_col)`` - the name -> value dict and
        the list of names in the order they were read
    :raises IndexError: when a non-blank entry contains no comma
    """
    # Close the file once pyth has parsed it instead of leaking the handle.
    with open(file_name) as rtf_file:
        doc = Rtf15Reader.read(rtf_file)
    entities = PlaintextWriter.write(doc).getvalue().split('\n\n')

    target = {}
    filtered_col = []
    for item in entities:
        if not item.strip():
            # Blank chunk (e.g. trailing newlines) - nothing to record.
            continue
        fields = item.split(',')
        name = fields[0].split('(')[0]
        target[name] = fields[1]
        filtered_col.append(name)
    self.target_values = target
    return target, filtered_col
开发者ID:Basit-qc,项目名称:WHO---Food-Menu,代码行数:27,代码来源:buildmenu.py
示例5: upload
def upload(request):
    """Handle the three states of the upload page.

    * Uploaded file under the ``'file'`` key: store it below
      ``shake_v3/static/data/``, produce a ``.txt`` rendition (through
      ``pdf_to_txt`` for names ending in ``pdf``, through pyth for names
      ending in ``rtf``, verbatim otherwise) and respond with the JSON dump
      of ``generate_term_dict`` for that text plus an ``'fp'`` entry.
    * POST data without files: respond with the letter grade for the score
      returned by ``custom_POST_to_score``.
    * Anything else: render ``upload.html`` with a score of 0.

    :param request: the incoming request object.
    :return: an ``HttpResponse`` / rendered template.

    NOTE(review): the nesting was reconstructed from a paste that had lost
    its indentation.  When files are present but none is named ``'file'``
    the function falls through and returns ``None``, as the original did.
    """
    # user uploads a document -> convert into a dict of the terms found
    if request.FILES:
        if 'file' in request.FILES:
            uploaded = request.FILES['file']
            served_name = str(uploaded)
            fp = 'shake_v3/static/data/' + served_name
            fp2 = fp[:-3] + 'txt'
            suffix = fp[-3:]
            if suffix == 'pdf':
                with open(fp, 'wb+') as pdff:
                    for chunk in uploaded.chunks():
                        pdff.write(chunk)
                result = pdf_to_txt(fp)
                with open(fp2, 'wb+') as txtf:
                    txtf.write(result)
            elif suffix == 'rtf':
                with open(fp, 'wb+') as rtff:
                    for line in uploaded:
                        rtff.write(line)
                # Re-open for reading inside ``with`` so the handle is closed.
                with open(fp, 'rb') as rtff:
                    doc = Rtf15Reader.read(rtff)
                result = PlaintextWriter.write(doc).getvalue()
                with open(fp2, 'wb+') as txtf:
                    # One write instead of one write per character.
                    txtf.write(result)
                served_name = served_name[:-4] + ".txt"
            else:
                with open(fp2, 'wb+') as txtf:
                    for line in uploaded:
                        txtf.write(line)
                with open(fp2, 'r') as txtf:
                    result = txtf.read()
            response_dict = generate_term_dict(result)
            response_dict['fp'] = 'static/data/' + served_name
            return HttpResponse(simplejson.dumps(response_dict), mimetype='application/javascript')
    # user indicates terms -> give a grade
    elif request.POST:
        #TO DO: implement saving the data
        return HttpResponse(_rating_for_score(custom_POST_to_score(request)))
    # display the upload part 1
    else:
        score = 0
        return render_to_response('upload.html', {'score': score}, context_instance = RequestContext(request))


def _rating_for_score(score):
    """Map a numeric score to its letter grade (strictly-greater cut-offs)."""
    cutoffs = (
        (4.5, 'A+'),
        (4, 'A'),
        (3.5, 'B+'),
        (3, 'B'),
        (2.5, 'C+'),
        (2, 'C'),
        (1, 'D'),
    )
    for floor, rating in cutoffs:
        if score > floor:
            return rating
    return 'F'
开发者ID:vickimo,项目名称:shakev3,代码行数:60,代码来源:views.py
示例6: analyze
def analyze(committeeFile):
    """Extract the participants of one committee protocol and write them out.

    The RTF file is converted to plain text, ``find_participants`` is run on
    that text, and every participant whose name contains the text before the
    first ":" of a line is marked as a speaker (``speak_count`` is bumped
    once per such line).  One line per participant is then written to the
    output file, whose path is *committeeFile* with ``global_options.indir``
    replaced by ``global_options.outdir`` and a trailing ``rtf`` replaced by
    ``txt``.

    :param committeeFile: path of the RTF protocol.
    :return: ``True`` on success; ``False`` when the RTF could not be read,
        in which case the file is copied to the ``global_options.errdir``
        counterpart of its path.

    NOTE(review): the nesting was reconstructed from a paste that had lost
    its indentation; the ``%`` formatting of each pair is assumed to run for
    ``None`` values too.
    """
    try:
        with open(committeeFile, "rb") as rtf_file:
            doc = Rtf15Reader.read(rtf_file)
    except Exception:
        print("%s - skipped..." % committeeFile)
        errFile = committeeFile.replace(global_options.indir, global_options.errdir)
        shutil.copyfile(committeeFile, errFile)
        return False

    text = PlaintextWriter.write(doc).getvalue()
    # The plain-text dump is still written (existing side effect), but the
    # text is no longer read back from disk twice.
    with open("test.out", 'w') as dump:
        dump.write(text)

    participants = find_participants(text)

    # Getting the indication whether the participant spoke in the committee
    for line in text.splitlines():
        if ":" not in line:
            continue
        speaker = line.split(":")[0]
        for p in participants:
            if speaker in p['name']:
                p['speaker'] = True
                p['speak_count'] += 1

    fname = committeeFile.replace(global_options.indir, global_options.outdir)
    # Swap only the trailing extension; a blanket replace() also rewrote
    # any "rtf" occurring in directory names.
    if fname.endswith("rtf"):
        fname = fname[:-len("rtf")] + "txt"

    with codecs.open(fname, "w", "utf-8") as out:
        for participant in participants:
            pairs = []
            for key, val in participant.items():
                if isinstance(val, str):
                    # Quotes would break the quoted output format.
                    val = val.replace("'", "").replace('"', '')
                pairs.append(u"'%s': '%s'" % (key, print_unicode(val)))
            wrt_ln = ', '.join(pairs) + ',\n'
            try:
                out.write(wrt_ln)
            except UnicodeEncodeError:
                print(wrt_ln)
    verbose("Generated participants file: " + fname)
    return True
开发者ID:assafsinvani,项目名称:gknesset,代码行数:59,代码来源:analyze_protocols.py
示例7: extract_terms
def extract_terms(rtffile):
    """Return every fourth line of the document's plain text.

    *rtffile* is handed straight to ``PlaintextWriter.write``; the rendered
    text is split on newlines and lines 0, 4, 8, ... are returned as a list.
    """
    plain = PlaintextWriter.write(rtffile).getvalue()
    all_lines = re.split('\n', plain)
    # Start at index 0, run to the end, step 4.
    return all_lines[0::4]
开发者ID:JonathanBowker,项目名称:memex-gate,代码行数:8,代码来源:scrape_legal_lexicon.py
示例8: load_stickies
def load_stickies(path):
    """Return the plain text of every note in the sticky database at *path*.

    The file content is split into RTF chunks by ``parse_sticky_database``;
    each chunk is parsed with pyth and rendered to plain text.
    """
    with open(path) as fd:
        return [
            PlaintextWriter.write(
                Rtf15Reader.read(StringIO.StringIO(blob))
            ).getvalue()
            for blob in parse_sticky_database(fd.read())
        ]
开发者ID:alexflint,项目名称:sticky-sync,代码行数:8,代码来源:client.py
示例9: get_rtf_text
def get_rtf_text(path):
    """
    Take the path of an rtf file as an argument and return the text
    """
    # Close the file once pyth has parsed it instead of leaking the handle.
    with open(path) as rtf_file:
        doc = Rtf15Reader.read(rtf_file)
    return PlaintextWriter.write(doc).getvalue()
开发者ID:vignesh117,项目名称:MusicalText,代码行数:9,代码来源:makecorpusfirstset.py
示例10: readRtf
def readRtf(self, path):
    """Return the plain text of the RTF file at *path*.

    When the file cannot be opened or parsed, the failure is reported via
    ``self._log`` and a fixed apology string is returned instead.

    :param path: path of the RTF file.
    :return: the plain text, or the fallback message on failure.
    """
    try:
        with open(path, "rb") as rtf_file:
            doc = Rtf15Reader.read(rtf_file)
    except Exception:
        # ``Exception`` rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        self._log("Some screwy rtf shit going on with " + path)
        return "Can't process ur shitty rtf <3 dfbot"
    return PlaintextWriter.write(doc).getvalue()
开发者ID:danielhfrank,项目名称:Tumbox,代码行数:9,代码来源:tumbox.py
示例11: parse
def parse(self, path):
    """Build a ``Sample`` from the RTF file at *path*.

    :param path: path of an RTF file.
    :return: ``Sample(path, None, text)`` where *text* is the document's
        plain text.
    :raises NotImplementedError: when *path* is a directory.
    """
    # Directory
    if os.path.isdir(path):
        raise NotImplementedError()
    # File
    with open(path) as rtf_file:
        doc = Rtf15Reader.read(rtf_file)
    return Sample(path, None, PlaintextWriter.write(doc).getvalue())
开发者ID:hcouch21,项目名称:styloproject,代码行数:9,代码来源:RtfParser.py
示例12: test_read2
def test_read2(self):
    """An OS X (cocoartf) RTF with Thai text is rendered as UTF-8 plaintext."""
    rtf = StringIO("""{\\rtf1\\ansi\\ansicpg1252\\cocoartf1343\\cocoasubrtf160\\cocoascreenfonts1{\\fonttbl\\f0\\fnil\\fcharset222 Thonburi;}
{\\colortbl;\\red255\\green255\\blue255;}
\\pard\\tx560\\tx1120\\tx1680\\tx2240\\tx2800\\tx3360\\tx3920\\tx4480\\tx5040\\tx5600\\tx6160\\tx6720\\pardirnatural\\qc
{\\f0\\fs24 \\cf0 \\'b9\\'e9\\'d3\\'b5\\'a1}""")
    doc = Rtf15Reader.read(rtf)
    text = PlaintextWriter.write(doc).read()
    # assertEqual: ``assertEquals`` is a deprecated alias.
    self.assertEqual(u"น้ำตก", text.decode('utf8'))
开发者ID:pphetra,项目名称:pyth,代码行数:10,代码来源:test_readosxrtf.py
示例13: clean_rtf
def clean_rtf(fname):
    """Parse the RTF file *fname* into a list of rows of trimmed fields.

    The plain text is split into lines, empty lines are dropped, each line
    is split on ";" and the first and last character of every field are
    removed.

    :param fname: path of the RTF file.
    :return: list of rows, each a list of field strings.
    """
    # Close the file once pyth has parsed it instead of leaking the handle.
    with open(fname) as rtf_file:
        doc = Rtf15Reader.read(rtf_file)
    plain = PlaintextWriter.write(doc).getvalue()
    return [
        [field[1:-1] for field in line.split(";")]
        for line in plain.split("\n")
        if line
    ]
开发者ID:embr,项目名称:nonce,代码行数:11,代码来源:upload_jami_rtf.py
示例14: _rtf_to_txt
def _rtf_to_txt(file_path, dst_dir, file_name):
    """
    Uses the pyth python module to extract text from a rtf file and save
    to .txt in dst_dir.

    :param file_path: path of the source RTF file.
    :param dst_dir: directory the .txt file is written to.
    :param file_name: name used for the output (a trailing ``.rtf`` becomes
        ``.txt``); when ``None`` the base name of *file_path* is used.
    :return: always 0.
    """
    if file_name is None:
        file_name = os.path.split(file_path)[1]
    file_dst = os.path.join(dst_dir, re.sub(r'\.rtf$', '.txt', file_name))
    # Close the source once pyth has parsed it instead of leaking the handle.
    with open(file_path) as rtf_file:
        doc = Rtf15Reader.read(rtf_file)
    txt = unidecode(PlaintextWriter.write(doc).getvalue())
    with open(file_dst, 'w') as f:
        f.write(txt)
    return 0
开发者ID:rjweiss,项目名称:rosetta,代码行数:14,代码来源:converters.py
示例15: _convert_rtf_to_text
def _convert_rtf_to_text(self, password=None):
    """Render ``self.cvFile`` (RTF) to a text file inside ``self.scratchDir``.

    The output name is the input's base name without its last
    dot-separated part, followed by the current integer Unix time and
    ``.txt``.  The resulting path is stored in ``self.cvTextFile``.

    :param password: not used by this method.
    :return: always 0.
    """
    input_rtf = self.cvFile
    # Close the input once pyth has parsed it instead of leaking the handle.
    with open(input_rtf) as rtf_file:
        rtf = Rtf15Reader.read(rtf_file)

    name_parts = os.path.basename(input_rtf).split(".")
    name_parts.pop()
    stamp = str(int(time.time()))
    self.cvTextFile = self.scratchDir + os.path.sep + ".".join(name_parts) + stamp + r".txt"

    # ``with`` guarantees the output is closed even if the write fails.
    with open(self.cvTextFile, "w") as fw:
        fw.write(PlaintextWriter.write(rtf).getvalue())
    return 0
开发者ID:arshpreetsingh,项目名称:cv-parser,代码行数:17,代码来源:cvparser.py
示例16: loadAllRTFToDB
def loadAllRTFToDB(folderPath):
    """Walk *folderPath*, split every ``.rtf`` file into articles and save them.

    Each file's plain text is reduced to its non-empty, stripped lines.  A
    line of the form ``Document <id>`` closes the article made of the lines
    since the previous such marker.  Within an article the last line
    starting with ``by `` is the byline, the last line ending in `` words``
    is the word-count line, and the text starts two lines after the last
    ``english`` line (all compared case-insensitively).  Articles missing
    the word-count or language line, or with them out of order, are only
    printed; the rest are stored through ``DBController.saveArticle``.

    :param folderPath: root directory to scan.

    NOTE(review): the nesting was reconstructed from a paste that had lost
    its indentation.
    """
    db = DBController()
    for dirPath, dirNames, fileNames in os.walk(folderPath):
        for fileName in fileNames:
            if not fileName.endswith('.rtf'):
                continue
            filePath = os.path.join(dirPath, fileName)
            print(filePath)
            try:
                with open(filePath) as rtfFile:
                    doc = Rtf15Reader.read(rtfFile)
                text = PlaintextWriter.write(doc).getvalue()
            except Exception:
                # Best effort: an unreadable file is skipped, but no longer
                # silently.
                print('skipped (could not convert): ' + filePath)
                continue

            lines = [line.strip() for line in text.split('\n') if line]
            articleLinesDict, articleStartIndex = {}, 0
            for i, line in enumerate(lines):
                if line.startswith('Document ') and len(line.split(' ')) == 2:
                    articleId = line.split(' ')[-1]
                    articleLinesDict[articleId] = lines[articleStartIndex : i]
                    articleStartIndex = i + 1

            for articleId, articleLines in articleLinesDict.items():
                bylineIndex, wordCountIndex, textStartIndex = -1, -1, -1
                for i, line in enumerate(articleLines):
                    lowered = line.lower()
                    if lowered.startswith('by '):
                        bylineIndex = i
                    elif lowered.endswith(' words'):
                        wordCountIndex = i
                    elif lowered == 'english':
                        textStartIndex = i + 2

                if wordCountIndex == -1 or textStartIndex == -1 or wordCountIndex > textStartIndex:
                    print(filePath + ', ' + articleId)
                    continue

                # The headline runs up to the byline when there is one,
                # otherwise up to the word-count line.
                headlineEnd = wordCountIndex if bylineIndex == -1 else bylineIndex
                # The source name follows the date line unless that slot
                # holds a time of day (" AM"/" PM"), in which case it is
                # one line further down.
                sourceLine = articleLines[wordCountIndex + 2]
                if ' AM' in sourceLine or ' PM' in sourceLine:
                    sourceLine = articleLines[wordCountIndex + 3]
                articleDict = {'_id': articleId,
                               'filePath' : filePath.split('Marshall_RA/')[-1],
                               'headline': ' '.join(articleLines[: headlineEnd]),
                               'byline' : '' if bylineIndex == -1 else articleLines[bylineIndex],
                               'date' : parser.parse(articleLines[wordCountIndex + 1]),
                               'sourceName' : sourceLine,
                               'leadParagraph' : '',
                               'tailParagraph' : '\n'.join(articleLines[textStartIndex:]),
                               'sourceCode' : '', 'industry' : [], 'region' : [], 'newsSubject' : [], 'company' : []}
                db.saveArticle(articleDict)
开发者ID:exsonic,项目名称:CorpusAnalysis,代码行数:45,代码来源:FileUtils.py
示例17: documentToText
def documentToText(path):
    """Return the ASCII-only text of the document at *path*.

    The converter is picked from the file-name suffix: ``antiword`` for
    ``.doc``, ``doc.process`` for ``.docx``, a plain read for ``.txt``,
    ``convert_pdf_to_txt`` for ``.pdf`` and pyth for ``.rtf``.  Every result
    is passed through ``removeNonAscii``.

    :param path: path of the document.
    :return: the cleaned text, or ``"Returned Nothing."`` for any other
        suffix.
    """
    if path.endswith(".doc"):
        p = Popen(['antiword', path], stdout=PIPE)
        stdout, _stderr = p.communicate()
        return removeNonAscii(stdout)
    if path.endswith(".docx"):
        return removeNonAscii(doc.process(path))
    if path.endswith(".txt"):
        with open(path) as inputFile:
            text = inputFile.read()
        return removeNonAscii(text)
    if path.endswith(".pdf"):
        return removeNonAscii(convert_pdf_to_txt(path))
    if path.endswith(".rtf"):
        # Close the file once pyth has parsed it instead of leaking it.
        with open(path) as rtfFile:
            rtfDoc = Rtf15Reader.read(rtfFile)
        return removeNonAscii(PlaintextWriter.write(rtfDoc).getvalue())
    return "Returned Nothing."
开发者ID:zcatbear,项目名称:Expelliarmus,代码行数:19,代码来源:expelliarmus.py
示例18: Run
def Run(journal_file):
    """Import the entries of an OS X journal plist.

    Every non-empty value of the plist is keyed by its date (the key, with
    ``MM/DD/YYYY`` rewritten to ``YYYY-MM-DD`` before parsing).
    ``plistlib.Data`` values are treated as RTF and rendered to plain text;
    other values are used as the text directly.  For each day a summary is
    written through ``utils.WriteSingleSummary`` and the original through
    ``utils.WriteOriginal``.

    :param journal_file: plist file passed to ``plistlib.readPlist``.
    :raises ValueError: re-raised from pyth when an RTF entry cannot be
        parsed (the raw data is printed first).
    :raises AssertionError: when a day has more than one entry.

    NOTE(review): ``dry_run`` is not defined in this function; presumably a
    module-level flag - confirm.  The nesting was reconstructed from a
    paste that had lost its indentation.
    """
    raw_entries = plistlib.readPlist(journal_file)
    acc = utils.EntryAccumulator(lambda x: x['date'])
    for k, v in raw_entries.iteritems():
        if not v:
            continue
        # 12/29/2001 -> 2001-12-29
        new_k = re.sub(r'(\d\d)/(\d\d)/(\d\d\d\d)', r'\3-\1-\2', k)
        d = parser.parse(new_k)
        if isinstance(v, plistlib.Data):
            f = StringIO.StringIO(v.data)
            try:
                doc = Rtf15Reader.read(f)
            except ValueError:
                print(v.data)
                # Bare ``raise`` keeps the original traceback; ``raise e``
                # resets it on Python 2.
                raise
            txt = PlaintextWriter.write(doc).getvalue()
            acc.add({
                'date': d,
                'rtf': v.data,
                'text': txt
            })
        else:
            acc.add({
                'date': d,
                'text': v
            })

    for day, entries in acc.iteritems():
        assert len(entries) == 1
        entry = entries[0]
        if not entry['text']:
            continue
        summary = utils.SummarizeText(entry['text'])
        utils.WriteSingleSummary(day, maker='osxapp', summary=summary, dry_run=dry_run)
        if 'rtf' in entry:
            utils.WriteOriginal(day, maker='osxapp', contents=entry['rtf'], filename='journal.rtf', dry_run=dry_run)
        else:
            utils.WriteOriginal(day, maker='osxapp', contents=entry['text'].encode('utf8'), filename='journal.txt', dry_run=dry_run)
开发者ID:danvk,项目名称:personal-archive,代码行数:42,代码来源:import_osxjournal.py
示例19: upload_file
def upload_file(request):
    """Accept an uploaded ``.rtf`` or ``.txt`` document and store it parsed.

    On a valid POST the file under ``doc_file`` is turned into text (via
    pyth for ``.rtf``, read verbatim for ``.txt``), run through
    ``LawHtmlParser`` and saved as a ``Document``; the client is then
    redirected to ``document:list``.  In every other case the upload form
    is rendered, with an error message when the POST was invalid or the
    file type is not supported.

    :param request: the incoming request object.
    :return: a redirect after a successful upload, otherwise the rendered
        ``document/upload.html``.
    """
    error_message = ""
    if request.method == "POST":
        form = UploadForm(request.POST, request.FILES)
        if form.is_valid():
            doc = request.FILES["doc_file"]
            doc_name = UploadedFile(doc)
            doc_uploaded_date = timezone.now()
            file_type = get_file_type(doc_name)

            parser = None
            if file_type == ".rtf":
                result = Rtf15Reader.read(doc)
                parser = LawHtmlParser(PlaintextWriter.write(result).read())
            elif file_type == ".txt":
                parser = LawHtmlParser(doc.read())

            if parser is None:
                # Previously any other extension crashed with an unbound
                # ``parser``; report it to the user instead.
                error_message = "Unsupported file type. Please upload an .rtf or .txt file."
            else:
                parsed_doc_content = parser.get_parsed_text()
                new_doc = Document(name=doc_name, content=parsed_doc_content, uploaded_date=doc_uploaded_date, file=doc)
                new_doc.save()
                return HttpResponseRedirect(reverse("document:list"))
        else:
            error_message = "Please select a file."
    form = UploadForm()
    return render(request, "document/upload.html", {"form": form, "error_message": error_message})
开发者ID:vteremasov,项目名称:zakon,代码行数:23,代码来源:views.py
示例20: document_create_index
def document_create_index(document, user_id=None):
    """Extract the text of an uploaded document and add it to the Solr index.

    The text is obtained with ``pdf2txt.py`` (pdf), ``antiword`` (doc),
    xlrd (xls), pyth (rtf) or ``strings`` (anything else), reduced to
    characters with ordinal >= 32, sent to Solr, and the ``doc_document``
    row is flagged ``has_been_indexed``.

    :param document: JSON string with at least ``id``, ``name`` and
        ``filename``.
    :param user_id: not used in this function body.
    """
    import os
    import subprocess
    from xlrd import open_workbook
    from pyth.plugins.rtf15.reader import Rtf15Reader
    from pyth.plugins.plaintext.writer import PlaintextWriter
    import sunburnt

    def run_tool(*argv):
        """Return the stdout of *argv*, run without a shell."""
        # An argument list keeps the file name from being interpreted by a
        # shell (spaces, quotes, ``;`` ...).
        try:
            proc = subprocess.Popen(list(argv), stdout=subprocess.PIPE)
        except OSError:
            # Tool not installed: os.popen yielded empty output in this
            # case, so keep doing that.
            return ""
        return proc.communicate()[0]

    document = json.loads(document)
    table = s3db.doc_document
    record_id = document["id"]
    name = document["name"]
    filename = document["filename"]
    filename = "%s/%s/uploads/%s" % (os.path.abspath("applications"), \
                                     request.application, filename)
    si = sunburnt.SolrInterface(settings.get_base_solr_url())
    extension = os.path.splitext(filename)[1][1:]

    if extension == "pdf":
        data = run_tool("pdf2txt.py", filename)
    elif extension == "doc":
        data = run_tool("antiword", filename)
    elif extension == "xls":
        wb = open_workbook(filename)
        rows = []
        for s in wb.sheets():
            for row in range(s.nrows):
                values = [str(s.cell(row, col).value) for col in range(s.ncols)]
                rows.append(",".join(values) + "\n")
        # Joined once instead of growing the string row by row.
        data = " " + "".join(rows)
    elif extension == "rtf":
        with open(filename) as rtf_file:
            doct = Rtf15Reader.read(rtf_file)
        data = PlaintextWriter.write(doct).getvalue()
    else:
        data = run_tool("strings", filename)

    # The text needs to be in unicode or ascii, with no control characters
    data = str(unicode(data, errors="ignore"))
    data = "".join(c if ord(c) >= 32 else " " for c in data)

    # Put the data according to the Multiple Fields
    # @ToDo: Also, would change this according to requirement of Eden
    document = {"id": str(record_id), # doc_document.id
                "name": data, # the data of the file
                "url": filename, # the encoded file name stored in uploads/
                "filename": name, # the filename actually uploaded by the user
                "filetype": extension # x.pdf -> pdf is the extension of the file
                }

    # Add and commit Indices
    si.add(document)
    si.commit()
    # After Indexing, set the value for has_been_indexed to True in the database
    db(table.id == record_id).update(has_been_indexed = True)
    db.commit()
开发者ID:gnarula,项目名称:eden_deployment,代码行数:61,代码来源:tasks.py
注:本文中的pyth.plugins.plaintext.writer.PlaintextWriter类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论