本文整理汇总了Python中regparser.tree.xml_parser.tree_utils.get_node_text函数的典型用法代码示例。如果您正苦于以下问题:Python get_node_text函数的具体用法?Python get_node_text怎么用?Python get_node_text使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了get_node_text函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: table_xml_to_data
def table_xml_to_data(xml_node):
    """Construct a data structure of the table data. We provide a different
    structure than the native XML as the XML encodes too much logic. This
    structure can be used to generate semi-complex tables which could not be
    generated from the markdown above"""
    header_root = build_header(xml_node.xpath('./BOXHD/CHED'))
    levels = [[] for _ in range(header_root.height())]

    def collect(hdr):
        # Record each header cell at its depth in the header tree
        levels[hdr.level].append({'text': hdr.text,
                                  'colspan': hdr.colspan,
                                  'rowspan': hdr.rowspan})
    struct.walk(header_root, collect)

    rows = [[tree_utils.get_node_text(cell, add_spaces=True).strip()
             for cell in row_xml.xpath('./ENT')]
            for row_xml in xml_node.xpath('./ROW')]
    # levels[0] only holds the synthetic root; drop it
    table_data = {'header': levels[1:], 'rows': rows}
    captions = xml_node.xpath('./TTITLE')
    if captions:
        table_data["caption"] = tree_utils.get_node_text(captions[0]).strip()
    return table_data
开发者ID:vrajmohan,项目名称:regulations-parser,代码行数:28,代码来源:formatting.py
示例2: test_get_node_text
def test_get_node_text(self):
    """get_node_text should flatten an XML element into plain text.

    Covered behaviors: plain concatenation (no spaces inserted),
    add_spaces=True inserting separators between segments without
    doubling existing whitespace, and subscript emphasis (<E T="52">)
    rendered in the _{...} plain-text form.
    """
    # Note: assertEquals is a deprecated alias (removed in Python 3.12);
    # use assertEqual instead.
    # Plain flattening: no spaces inserted around <E>/<PRTPAGE>
    text = '<P>(a)<E T="03">Fruit.</E>Apps,<PRTPAGE P="102"/> and pins</P>'
    doc = etree.fromstring(text)
    result = tree_utils.get_node_text(doc)
    self.assertEqual('(a)Fruit.Apps, and pins', result)
    # add_spaces=True inserts separators between text segments
    text = '<P>(a)<E T="03">Fruit.</E>Apps,<PRTPAGE P="102"/> and pins</P>'
    doc = etree.fromstring(text)
    result = tree_utils.get_node_text(doc, add_spaces=True)
    self.assertEqual('(a) Fruit. Apps, and pins', result)
    # ...but must not double spaces that already exist
    text = '<P>(a) <E T="03">Fruit.</E> Apps, and pins</P>'
    doc = etree.fromstring(text)
    result = tree_utils.get_node_text(doc, add_spaces=True)
    self.assertEqual('(a) Fruit. Apps, and pins', result)
    # T="52" emphasis is a subscript: rendered as _{...}
    text = '<P>(a) ABC<E T="52">123</E>= 5</P>'
    doc = etree.fromstring(text)
    result = tree_utils.get_node_text(doc, add_spaces=True)
    self.assertEqual('(a) ABC_{123} = 5', result)
    # Plain <E> keyterms still get surrounding spaces alongside subscripts
    text = '<P>(a) <E>Keyterm.</E> ABC<E T="52">123</E>= 5</P>'
    doc = etree.fromstring(text)
    result = tree_utils.get_node_text(doc, add_spaces=True)
    self.assertEqual('(a) Keyterm. ABC_{123} = 5', result)
开发者ID:EricSchles,项目名称:regulations-parser,代码行数:28,代码来源:tree_utils_tests.py
示例3: table_xml_to_plaintext
def table_xml_to_plaintext(xml_node):
    """Markdown representation of a table. Note that this doesn't account
    for all the options needed to display the table properly, but works fine
    for simple tables. This gets included in the reg plain text"""
    header = [tree_utils.get_node_text(hd, add_spaces=True).strip()
              for hd in xml_node.xpath('./BOXHD/CHED|./TTITLE')]
    body = [[tree_utils.get_node_text(cell, add_spaces=True).strip()
             for cell in tr.xpath('./ENT')]
            for tr in xml_node.xpath('./ROW')]
    # Markdown table: header row, divider row, then the data rows
    all_rows = [header, ['---'] * len(header)] + body
    return '\n'.join('|' + '|'.join(cells) + '|' for cells in all_rows)
开发者ID:vrajmohan,项目名称:regulations-parser,代码行数:15,代码来源:formatting.py
示例4: build_header
def build_header(xml_nodes):
    """Builds a TableHeaderNode tree, with an empty root. Each node in the tree
    includes its colspan/rowspan"""
    stack = HeaderStack()
    stack.add(0, TableHeaderNode(None, 0))  # synthetic root
    for xml_node in xml_nodes:
        depth = int(xml_node.attrib['H'])
        label = tree_utils.get_node_text(xml_node, add_spaces=True).strip()
        stack.add(depth, TableHeaderNode(label, depth))
    while stack.size() > 1:
        stack.unwind()
    root = stack.m_stack[0][0][1]

    tree_height = root.height()

    def assign_rowspan(hdr):
        # A header cell spans down to the level where its children begin
        hdr.rowspan = tree_height - hdr.height() - hdr.level + 1
    struct.walk(root, assign_rowspan)

    def assign_colspan(hdr):
        # A header cell spans as many columns as its leaf descendants
        hdr.colspan = hdr.width()
    struct.walk(root, assign_colspan)
    return root
开发者ID:adderall,项目名称:regulations-parser,代码行数:25,代码来源:formatting.py
示例5: parse_amdpar
def parse_amdpar(par, initial_context):
    """Parse the <AMDPAR> tags into a list of paragraphs that have changed.

    :param par: an <AMDPAR> XML element containing amendment instructions
    :param initial_context: context (label components) carried over from the
        previous AMDPAR, used to resolve relative references
    :return: (amends, final_context) -- the amendment objects derived from
        this paragraph and the context to seed the next AMDPAR with
    """
    # Strip ' and ' inside <E> (emphasis) titles; the literal word would
    # otherwise throw off and_token_resolution below
    for e in filter(lambda e: e.text, par.xpath('./E')):
        e.text = e.text.replace(' and ', ' ')
    text = get_node_text(par, add_spaces=True)
    # Scan the flattened text into amendment tokens, then run a sequence of
    # normalization passes over the token stream. The order of these passes
    # matters: each assumes the transformations before it have been applied.
    tokenized = [t[0] for t, _, _ in amdpar.token_patterns.scanString(text)]
    tokenized = compress_context_in_tokenlists(tokenized)
    tokenized = resolve_confused_context(tokenized, initial_context)
    tokenized = paragraph_in_context_moved(tokenized, initial_context)
    tokenized = remove_false_deletes(tokenized, text)
    tokenized = multiple_moves(tokenized)
    tokenized = switch_passive(tokenized)
    tokenized = and_token_resolution(tokenized)
    # Subpart additions are handled separately from ordinary token lists
    tokenized, subpart = deal_with_subpart_adds(tokenized)
    tokenized = context_to_paragraph(tokenized)
    tokenized = move_then_modify(tokenized)
    if not subpart:
        tokenized = separate_tokenlist(tokenized)
    initial_context = switch_context(tokenized, initial_context)
    tokenized, final_context = compress_context(tokenized, initial_context)
    amends = make_amendments(tokenized, subpart)
    return amends, final_context
开发者ID:EricSchles,项目名称:regulations-parser,代码行数:26,代码来源:diff.py
示例6: nodes_from_interp_p
def nodes_from_interp_p(xml_node):
    """Given an XML node that contains text for an interpretation paragraph,
    split it into sub-paragraphs and account for trailing stars.

    Yields a Node for the paragraph's leading marker, then one Node per
    collapsed (inline) marker found in the text; after any yielded node
    whose text ends in '* * *', an INLINE_STARS placeholder node follows.
    """
    node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
    first_marker = get_first_interp_marker(text_with_tags)
    collapsed = collapsed_markers_matches(node_text, text_with_tags)
    # -2 throughout to account for matching the character + period
    ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
    starts = [m.end() - 2 for m in collapsed] + [len(node_text)]
    # Node for this paragraph (text up to the first collapsed marker)
    n = Node(node_text[0:starts[0]], label=[first_marker],
             node_type=Node.INTERP, tagged_text=text_with_tags)
    yield n
    if n.text.endswith('* * *'):
        yield Node(label=[mtypes.INLINE_STARS])
    # Collapsed-marker children
    for match, end in zip(collapsed, ends):
        marker = match.group(1)
        if marker == '1':
            # NOTE(review): a bare '1' marker is normalized to its tagged
            # (emphasized) form -- presumably to distinguish it from other
            # uses of '1'; confirm against collapsed_markers_matches
            marker = '<E T="03">1</E>'
        n = Node(node_text[match.end() - 2:end], label=[marker],
                 node_type=Node.INTERP)
        yield n
        if n.text.endswith('* * *'):
            yield Node(label=[mtypes.INLINE_STARS])
开发者ID:eregs,项目名称:regulations-parser,代码行数:29,代码来源:gpo_cfr.py
示例7: process_inner_children
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()

    :param inner_stack: NodeStack the derived nodes are pushed onto
    :param xml_node: the title element; all siblings up to the next title
        are treated as this interpretation's content
    """
    # Everything up to (but not including) the next title belongs here
    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for xml_node in filter(lambda c: c.tag in ('P', 'STARS'), children):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            # Unmarked paragraph: treat it as a continuation of the
            # previous node rather than a node of its own
            logger.warning("Couldn't determine interp marker. Appending to "
                           "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if previous.tagged_text:
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            nodes.extend(nodes_from_interp_p(xml_node))
    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]
    add_nodes_to_stack(nodes, inner_stack)
开发者ID:eregs,项目名称:regulations-parser,代码行数:29,代码来源:gpo_cfr.py
示例8: derive_nodes
def derive_nodes(self, xml, processor=None):
    """Wrap the element's children in a github-style fenced block and
    return it as a single marker-less node."""
    lines = ["```" + self.fence_type(xml)]
    lines.extend(tree_utils.get_node_text(child).strip() for child in xml)
    lines.append("```")
    return [Node("\n".join(lines), label=[mtypes.MARKERLESS])]
开发者ID:theresaanna,项目名称:regulations-parser,代码行数:7,代码来源:paragraph_processor.py
示例9: make_authority_instructions
def make_authority_instructions(auth_xml, cfr_part):
    """Creates an `EREGS_INSTRUCTIONS` element specific to the authority
    information"""
    instructions = etree.Element('EREGS_INSTRUCTIONS')
    authority = etree.SubElement(instructions, 'AUTHORITY', label=cfr_part)
    # One line per <P> child, flattened to plain text
    paragraphs = [get_node_text(p, add_spaces=True)
                  for p in auth_xml.xpath('./P')]
    authority.text = '\n'.join(paragraphs)
    return instructions
开发者ID:tadhg-ohiggins,项目名称:regulations-parser,代码行数:8,代码来源:amdparser.py
示例10: process
def process(self, appendix, part):
    """Parse an appendix XML element into a Node tree.

    :param appendix: the appendix XML element
    :param part: the CFR part this appendix belongs to
    :return: the root Node of the parsed appendix, or None if nothing
        was produced
    """
    self.m_stack = tree_utils.NodeStack()
    self.part = part
    self.paragraph_count = 0
    self.header_count = 0
    self.depth = None
    self.appendix_letter = None
    # holds collections of nodes until their depth is determined
    self.nodes = []
    self.set_letter(appendix)
    remove_toc(appendix, self.appendix_letter)

    def is_subhead(tag, text):
        # A subheader is an HD without a plain paragraph marker (or with a
        # dotted one), or a P/FP whose text parses as a title/label pair
        initial = initial_marker(text)
        return ((tag == 'HD' and (not initial or '.' in initial[1]))
                or (tag in ('P', 'FP')
                    and title_label_pair(text, self.appendix_letter,
                                         self.part)))
    # Dispatch on each child element's tag; end_group() flushes the
    # pending self.nodes buffer whenever a structural boundary is hit
    for child in appendix.getchildren():
        text = tree_utils.get_node_text(child, add_spaces=True).strip()
        if ((child.tag == 'HD' and child.attrib['SOURCE'] == 'HED')
                or child.tag == 'RESERVED'):
            self.end_group()
            self.hed(part, text)
        elif is_subhead(child.tag, text):
            self.end_group()
            self.subheader(child, text)
        elif initial_marker(text) and child.tag in ('P', 'FP', 'HD'):
            text = self.insert_dashes(child, text)
            self.paragraph_with_marker(
                text,
                tree_utils.get_node_text_tags_preserved(child))
        elif child.tag == 'SEQUENCE':
            # end_group resets self.depth; preserve it across the flush
            old_depth = self.depth
            self.end_group()
            self.depth = old_depth
            self.process_sequence(child)
        elif child.tag in ('P', 'FP'):
            text = self.insert_dashes(child, text)
            self.paragraph_no_marker(text)
        elif child.tag == 'GPH':
            self.graphic(child)
        elif child.tag == 'GPOTABLE':
            self.table(child)
        elif child.tag in ('NOTE', 'NOTES'):
            self.fence(child, 'note')
        elif child.tag == 'CODE':
            self.fence(child, child.get('LANGUAGE', 'code'))
    self.end_group()
    # Collapse the stack down to a single root node
    while self.m_stack.size() > 1:
        self.m_stack.unwind()
    if self.m_stack.m_stack[0]:
        return self.m_stack.m_stack[0][0][1]
开发者ID:cfpb,项目名称:regulations-parser,代码行数:58,代码来源:appendices.py
示例11: process_appendix
def process_appendix(m_stack, current_section, child):
    """Walk an appendix element's children, pushing HD headers and marked
    P paragraphs onto the node stack.

    :param m_stack: the NodeStack being built
    :param current_section: section identifier used to derive the appendix
        section number from headers
    :param child: the appendix XML element whose children are processed
    """
    # NOTE(review): html_parser appears unused within this function
    html_parser = HTMLParser.HTMLParser()
    for ch in child.getchildren():
        if ch.tag == 'HD':
            appendix_section = get_appendix_section_number(
                ch.text, current_section)
            if appendix_section is None:
                appendix_section = determine_next_section(m_stack, 2)
            n = Node(
                node_type=Node.APPENDIX, label=[appendix_section],
                title=ch.text)
            node_level = 2
            tree_utils.add_to_stack(m_stack, node_level, n)
        if ch.tag == 'P':
            # Reassemble the paragraph's full text (element text + tails)
            text = ' '.join([ch.text] + [c.tail for c in ch if c.tail])
            markers_list = tree_utils.get_paragraph_markers(text)
            node_text = tree_utils.get_node_text(ch)
            if len(markers_list) > 0:
                if len(markers_list) > 1:
                    # Multiple markers in one <P>: split the text at each
                    # '(x)' marker so each gets its own node
                    actual_markers = ['(%s)' % m for m in markers_list]
                    node_text = tree_utils.split_text(
                        node_text, actual_markers)
                else:
                    node_text = [node_text]
                for m, node_text in zip(markers_list, node_text):
                    n = Node(
                        node_text, label=[str(m)], node_type=Node.APPENDIX)
                    last = m_stack.peek()
                    node_level = determine_level(m, last[0][0])
                    if m == 'i':
                        # This is a bit of a hack, since we can't easily
                        # distinguish between the Roman numeral (i) and the
                        # letter (i) to determine the level. We look ahead
                        # at the next paragraph's markers to help. This is
                        # not a complete solution and we should circle back
                        # at some point.
                        next_text = ' '.join(
                            [ch.getnext().text] +
                            [c.tail for c in ch.getnext() if c.tail])
                        next_markers = tree_utils.get_paragraph_markers(
                            next_text)
                        if next_markers[0] == 'ii':
                            node_level = 5
                    tree_utils.add_to_stack(m_stack, node_level, n)
            else:
                # No marker: append the text to the most recent node
                last = m_stack.peek_last()
                last[1].text = last[1].text + '\n %s' % node_text
开发者ID:dclegalhackers,项目名称:regulations-parser,代码行数:58,代码来源:appendices.py
示例12: set_letter
def set_letter(self, appendix):
    """Find (and set) the appendix letter"""
    for header in appendix_headers(appendix):
        header_text = tree_utils.get_node_text(header)
        if self.appendix_letter:
            # More than one header found; warn, then keep the latest
            logger.warning("Found two appendix headers: %s and %s",
                           self.appendix_letter, header_text)
        self.appendix_letter = grammar.headers.parseString(
            header_text).appendix
    return self.appendix_letter
开发者ID:eregs,项目名称:regulations-parser,代码行数:9,代码来源:appendices.py
示例13: derive_nodes
def derive_nodes(self, xml, processor=None):
    """Render the element's children as a github-style fenced code block
    (skipping empty children) in a single marker-less node."""
    stripped_children = (tree_utils.get_node_text(child).strip()
                         for child in xml)
    lines = ["```" + xml.get('LANGUAGE', 'code')]
    lines.extend(text for text in stripped_children if text)
    lines.append("```")
    return [Node("\n".join(lines), label=[mtypes.MARKERLESS])]
开发者ID:vrajmohan,项目名称:regulations-parser,代码行数:9,代码来源:paragraph_processor.py
示例14: process
def process(self, appendix, part):
    """Parse an appendix XML element into a Node tree.

    :param appendix: the appendix XML element
    :param part: the CFR part this appendix belongs to
    :return: the root Node of the parsed appendix, or None if nothing
        was produced
    """
    self.m_stack = tree_utils.NodeStack()
    self.paragraph_count = 0
    self.header_count = 0
    self.depth = None
    self.appendix_letter = None
    self.set_letter(appendix)
    remove_toc(appendix, self.appendix_letter)

    def is_subhead(tag, text):
        # A subheader is an HD without a plain paragraph marker (or with a
        # dotted one), or a P/FP whose text parses as a title/label pair
        initial = initial_marker(text)
        return ((tag == 'HD' and (not initial or '.' in initial[1]))
                or (tag in ('P', 'FP')
                    and title_label_pair(text, self.appendix_letter)))
    # Dispatch on each child element's tag
    for child in appendix.getchildren():
        text = tree_utils.get_node_text(child, add_spaces=True).strip()
        if ((child.tag == 'HD' and child.attrib['SOURCE'] == 'HED')
                or child.tag == 'RESERVED'):
            self.hed(part, text)
        elif is_subhead(child.tag, text):
            self.subheader(child, text)
        elif initial_marker(text) and child.tag in ('P', 'FP', 'HD'):
            # Peek ahead to the next marked paragraph so split points /
            # depth can be determined
            if child.getnext() is None:
                next_text = ''
            else:
                next_text = self.find_next_text_with_marker(
                    child.getnext()) or ''
            texts = self.split_paragraph_text(text, next_text)
            for text, next_text in zip(texts, texts[1:]):
                self.paragraph_with_marker(text, next_text)
        elif child.tag in ('P', 'FP'):
            self.paragraph_no_marker(text)
        elif child.tag == 'GPH':
            self.graphic(child)
        elif child.tag == 'GPOTABLE':
            self.table(child)
        elif child.tag in ('NOTE', 'NOTES'):
            self.fence(child, 'note')
        elif child.tag == 'CODE':
            self.fence(child, child.get('LANGUAGE', 'code'))
    # Collapse the stack down to a single root node
    while self.m_stack.size() > 1:
        self.m_stack.unwind()
    if self.m_stack.m_stack[0]:
        root = self.m_stack.m_stack[0][0][1]

        def per_node(n):
            # p_level was only bookkeeping during parsing; drop it
            if hasattr(n, 'p_level'):
                del n.p_level
        walk(root, per_node)
        return root
开发者ID:khandelwal,项目名称:regulations-parser,代码行数:56,代码来源:appendices.py
示例15: set_letter
def set_letter(self, appendix):
    """Find (and set) the appendix letter"""
    for child in appendix.getchildren():
        if not is_appendix_header(child):
            continue
        header_text = tree_utils.get_node_text(child)
        if self.appendix_letter:
            # More than one header found; warn, then keep the latest
            logging.warning("Found two appendix headers: %s and %s",
                            self.appendix_letter, header_text)
        self.appendix_letter = headers.parseString(header_text).appendix
    return self.appendix_letter
开发者ID:khandelwal,项目名称:regulations-parser,代码行数:10,代码来源:appendices.py
示例16: derive_nodes
def derive_nodes(self, xml, processor):
    """Finds and deletes the category header before recursing. Adds this
    header as a title."""
    xml = deepcopy(xml)  # don't mutate the caller's tree
    header = xml.xpath('./HD')[0]
    xml.remove(header)
    title = tree_utils.get_node_text(header)
    seed = Node(title=title, label=[self.marker(title)])
    return [processor.process(xml, seed)]
开发者ID:anthonygarvan,项目名称:regulations-parser,代码行数:10,代码来源:import_category.py
示例17: fence
def fence(self, xml_node, fence_type):
    """Use github-like fencing to indicate this is a note or code"""
    self.paragraph_counter += 1
    body = ["```" + fence_type]
    body.extend(tree_utils.get_node_text(child).strip()
                for child in xml_node)
    body.append("```")
    fenced = Node("\n".join(body), node_type=Node.APPENDIX,
                  label=['p' + str(self.paragraph_counter)],
                  source_xml=xml_node)
    self.nodes.append(fenced)
开发者ID:EricSchles,项目名称:regulations-parser,代码行数:11,代码来源:appendices.py
示例18: process_sequence
def process_sequence(self, root):
    """Emit each child of a SEQUENCE element as a marked paragraph, then
    close the group one level deeper while preserving the current depth."""
    for child in root.getchildren():
        raw = tree_utils.get_node_text(child, add_spaces=True).strip()
        dashed = self.insert_dashes(child, raw)
        self.paragraph_with_marker(
            dashed, tree_utils.get_node_text_tags_preserved(child))
    saved_depth = self.depth
    self.depth += 1
    self.end_group()
    self.depth = saved_depth
开发者ID:cfpb,项目名称:regulations-parser,代码行数:11,代码来源:appendices.py
示例19: test_appendix_headers
def test_appendix_headers():
    """appendix_headers should yield, in document order, HDs with
    SOURCE='HED', RESERVED elements, and WHED elements -- and skip
    everything else."""
    with XMLBuilder('APPENDIX') as ctx:
        ctx.EAR('1')
        ctx.HD('2', SOURCE='HED')
        ctx.P('3')
        ctx.HD('4', SOURCE='HD1')
        ctx.GPH('5')
        ctx.RESERVED('6')
        with ctx.WHED():
            ctx.E('7')
    found = [get_node_text(header)
             for header in appendices.appendix_headers(ctx.xml)]
    assert found == ['2', '6', '7']
开发者ID:eregs,项目名称:regulations-parser,代码行数:12,代码来源:appendices_tests.py
示例20: parse_from_xml
def parse_from_xml(root, xml_nodes):
    """Core of supplement processing; shared by whole XML parsing and notice
    parsing. root is the root interpretation node (e.g. a Node with label
    '1005-Interp'). xml_nodes contains all XML nodes which will be relevant
    to the interpretations

    :return: the root Node with the parsed interpretation tree attached
    """
    supplement_nodes = [root]
    last_label = root.label
    header_count = 0
    for ch in xml_nodes:
        node = Node(label=last_label, node_type=Node.INTERP)
        label_obj = Label.from_node(node)
        # Explicitly ignore "subpart" headers, as they are inconsistent
        # and they will be reconstructed as subterps client-side
        text = tree_utils.get_node_text(ch, add_spaces=True)
        if is_title(ch) and 'subpart' not in text.lower():
            labels = text_to_labels(text, label_obj)
            if labels:
                label = merge_labels(labels)
            else:   # Header without a label, like an Introduction, etc.
                header_count += 1
                label = root.label[:2] + ['h%d' % header_count]
            inner_stack = tree_utils.NodeStack()
            # Fill in placeholder nodes for any levels skipped between the
            # previous label and this one
            missing = missing_levels(last_label, label)
            supplement_nodes.extend(missing)
            last_label = label
            node = Node(node_type=Node.INTERP, label=label,
                        title=text.strip())
            inner_stack.add(2, node)
            # Everything between this title and the next becomes children
            process_inner_children(inner_stack, ch, parent=node)
            while inner_stack.size() > 1:
                inner_stack.unwind()
            ch_node = inner_stack.m_stack[0][0][1]
            supplement_nodes.append(ch_node)
    supplement_tree = treeify(supplement_nodes)

    def per_node(node):
        # Strip emphasis tags that leaked into label components
        node.label = [l.replace('<E T="03">', '') for l in node.label]
        for child in node.children:
            per_node(child)
    for node in supplement_tree:
        per_node(node)
    return supplement_tree[0]
开发者ID:cfpb,项目名称:regulations-parser,代码行数:52,代码来源:interpretations.py
注:本文中的regparser.tree.xml_parser.tree_utils.get_node_text函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论