• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python api.CorpusReader类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了 Python 中 nltk.corpus.reader.api.CorpusReader 的典型用法代码示例。如果您正苦于以下问题:Python CorpusReader 类的具体用法?Python CorpusReader 怎么用?Python CorpusReader 使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了 CorpusReader 类的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的 Python 代码示例。

示例1: __init__

    def __init__(
        self,
        root,
        fileids,
        sep="/",
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding="latin1",
    ):
        """
        Set up an aligned-corpus reader over the documents found under
        ``root``.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*') # doctest: +SKIP

        Note that the default tokenizer objects are created once, when this
        function is defined, and are therefore shared by every reader that
        does not supply its own.

        :param root: The root directory for this corpus; forwarded to
            ``CorpusReader.__init__``.
        :param fileids: A list or regexp specifying the fileids in this
            corpus; forwarded to ``CorpusReader.__init__``.
        :param sep: Separator string, kept on the instance as ``_sep``.
        :param word_tokenizer: Kept on the instance as ``_word_tokenizer``.
        :param sent_tokenizer: Kept on the instance as ``_sent_tokenizer``.
        :param alignedsent_block_reader: Kept on the instance as
            ``_alignedsent_block_reader``.
        :param encoding: Forwarded to ``CorpusReader.__init__``.
        """
        # The base class handles root / fileids / encoding.
        CorpusReader.__init__(self, root, fileids, encoding)

        # Everything else is simply remembered for later use.
        self._alignedsent_block_reader = alignedsent_block_reader
        self._sent_tokenizer = sent_tokenizer
        self._word_tokenizer = word_tokenizer
        self._sep = sep
开发者ID:Reagankm,项目名称:KnockKnock,代码行数:25,代码来源:aligned.py


示例2: __init__

    def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None,
                 target_language=None, paragraph_separator='\n\n', **kwargs):
        """
        Build the reader, deriving a default JSON fileid pattern when no
        ``fileids`` value is supplied.

        :param root: The file root of the corpus directory
        :param fileids: the list of file ids to consider, or a wildcard
            expression; when falsy, a pattern matching every file whose name
            ends in ``<target_language>.json`` is used instead
        :param encoding: passed to the ``CorpusReader`` constructor
        :param skip_keywords: stored as ``self.skip_keywords``; a list of words
            which indicate whole paragraphs that should be skipped
        :param target_language: text inserted before ``.json`` in the default
            fileid pattern; a falsy value is treated as the empty string
        :param paragraph_separator: character sequence demarcating paragraph
            separation; stored as ``self.paragraph_separator``
        :param kwargs: only ``sent_tokenizer`` and ``word_tokenizer`` are
            read; when present they are stored as ``self._sent_tokenizer`` and
            ``self._word_tokenizer``
        """
        language_suffix = target_language or ''
        selected_fileids = fileids or r'.*{}\.json'.format(language_suffix)

        # Initialize the NLTK corpus reader objects
        CorpusReader.__init__(self, root, selected_fileids, encoding)

        # Only tokenizers the caller explicitly supplied are set here.
        for option in ('sent_tokenizer', 'word_tokenizer'):
            if option in kwargs:
                setattr(self, '_' + option, kwargs[option])

        self.paragraph_separator = paragraph_separator
        self.skip_keywords = skip_keywords
开发者ID:diyclassics,项目名称:cltk,代码行数:28,代码来源:readers.py


示例3: __init__

 def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
     """
     Initialize KNBCorpusReader.

     :param root: forwarded to ``CorpusReader.__init__``
     :param fileids: forwarded to ``CorpusReader.__init__``
     :param encoding: forwarded to ``CorpusReader.__init__``
     :param morphs2str: function to convert a morph list to a str for the
         tree representation used by ``_parse()``; stored as
         ``self.morphs2str``
     """
     CorpusReader.__init__(self, root, fileids, encoding)
     self.morphs2str = morphs2str
开发者ID:DrDub,项目名称:nltk,代码行数:8,代码来源:knbc.py


示例4: __init__

 def __init__(self, root, fileids=DOC_PATTERN, **kwargs):
     """
     Initialize the corpus reader.

     Only ``root`` and ``fileids`` are forwarded to the ``CorpusReader``
     constructor.

     NOTE(review): ``**kwargs`` is accepted but never used in this method,
     so categorization arguments (``cat_pattern``, ``cat_map``,
     ``cat_file``) and any other keyword arguments are silently dropped.
     Confirm whether they were meant to be passed on to a
     ``CategorizedCorpusReader`` constructor.
     """
     CorpusReader.__init__(self, root, fileids)
开发者ID:yokeyong,项目名称:atap,代码行数:8,代码来源:am_reader.py


示例5: __init__

 def __init__(self, root, fileids, tone, tag, wrap_etree=False):
     """
     Initialize the reader, its options and its (initially empty) result
     containers.

     :param root: forwarded to ``CorpusReader.__init__``
     :param fileids: forwarded to ``CorpusReader.__init__`` and also stored
         as ``self.fileids``
     :param tone: stored as ``self.option_tone``
     :param tag: stored as ``self.option_tag``
     :param wrap_etree: stored as ``self._wrap_etree``

     NOTE(review): ``fileids``, ``sents``, ``words``, ``tagged_sents`` and
     ``tagged_words`` are set as plain instance attributes here; NLTK corpus
     readers commonly expose methods with these names, so confirm that the
     shadowing is intended.
     """
     self._wrap_etree = wrap_etree
     self.fileids = fileids
     CorpusReader.__init__(self, root, fileids)

     # Each container gets its own fresh list object.
     self.tagged_sents, self.sents = [], []
     self.words, self.tagged_words = [], []

     self.option_tone, self.option_tag = tone, tag
开发者ID:Batene,项目名称:Bamanankan,代码行数:10,代码来源:htmlreaderALL.py


示例6: __init__

 def __init__(self, root, fileids,
              syntax_parser=CaboChaParser(),
              word_tokenizer=MeCabTokenizer(),
              sent_tokenizer=jp_sent_tokenizer,
              case_parser=KNPParser(),
              encoding='utf-8'):
   """
   Initialize the reader and remember the analysis components to use.

   The default parser/tokenizer objects are created once, when this
   function is defined, and are shared by every reader that does not
   pass its own.

   :param root: forwarded to ``CorpusReader.__init__``
   :param fileids: forwarded to ``CorpusReader.__init__``
   :param syntax_parser: stored as ``self._syntax_parser``
   :param word_tokenizer: stored as ``self._word_tokenizer``
   :param sent_tokenizer: stored as ``self._sent_tokenizer``
   :param case_parser: stored as ``self._case_parser``
   :param encoding: forwarded to ``CorpusReader.__init__``
   """
   CorpusReader.__init__(self, root, fileids, encoding)

   # Tokenizers first, then the two parsers.
   self._sent_tokenizer = sent_tokenizer
   self._word_tokenizer = word_tokenizer
   self._case_parser = case_parser
   self._syntax_parser = syntax_parser
开发者ID:miyamofigo,项目名称:Japanese-corpus-and-utility,代码行数:11,代码来源:corpus.py


示例7: __init__

 def __init__(self, root, zipfile, fileids):
     """
     Initialize a reader whose corpus files live inside a zip archive.

     :param root: a path string or a ``PathPointer``; a string is first
         wrapped in a ``FileSystemPathPointer``
     :param zipfile: joined onto ``root`` (via ``root.join``) and the result
         is wrapped in a ``ZipFilePathPointer``, which becomes the root
         handed to ``CorpusReader.__init__``
     :param fileids: forwarded to ``CorpusReader.__init__``
     :raises TypeError: if ``root`` is neither a string nor a ``PathPointer``
     """
     # ``basestring`` only exists on Python 2.  Fall back to ``str`` so the
     # string check also works on Python 3 instead of raising NameError.
     try:
         string_types = basestring  # noqa: F821
     except NameError:
         string_types = str

     if isinstance(root, string_types):
         root = FileSystemPathPointer(root)
     elif not isinstance(root, PathPointer):
         raise TypeError('CorpusReader: expected a string or a PathPointer')

     # convert to a ZipFilePathPointer
     root = ZipFilePathPointer(root.join(zipfile))

     CorpusReader.__init__(self, root, fileids)

     self._parse_char_replacements()
开发者ID:IMAmuseum,项目名称:getty-vocab-reconciliation,代码行数:12,代码来源:getty.py


示例8: __init__

    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.

        The keyword arguments are handed, as a single dict, to the
        ``CategorizedCorpusReader`` constructor; if none of them has a name
        starting with ``cat_``, ``cat_pattern`` is first set to
        ``CAT_PATTERN``.  Only ``root`` and ``fileids`` are passed to the
        ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        caller_gave_category_arg = any(
            name.startswith('cat_') for name in kwargs
        )
        if not caller_gave_category_arg:
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)
开发者ID:yokeyong,项目名称:atap,代码行数:13,代码来源:reader.py


示例9: __init__

 def __init__(self, root, fileids,
              sep='/', word_tokenizer=WhitespaceTokenizer(),
              sent_tokenizer=RegexpTokenizer('\n', gaps=True),
              encoding=None):
     """
     Initialize the aligned corpus reader.

     @param root: The root directory for this corpus.
     @param fileids: A list or regexp specifying the fileids in this corpus.
     @param sep: Separator string, stored as C{self._sep}.
     @param word_tokenizer: Stored as C{self._word_tokenizer}.
     @param sent_tokenizer: Stored as C{self._sent_tokenizer}.
     @param encoding: Passed to the C{CorpusReader} constructor.
     """
     CorpusReader.__init__(self, root, fileids, encoding)
     self._sep = sep
     self._word_tokenizer = word_tokenizer
     self._sent_tokenizer = sent_tokenizer
     # No block reader or corpus view is configured at construction time.
     # This must be a plain ``None``: a trailing comma after the value would
     # store the one-element tuple ``(None,)``, which is truthy.
     self._alignedsent_block_reader = None
     self._alignedsent_corpus_view = None
开发者ID:yochananmkp,项目名称:clir,代码行数:15,代码来源:aligned_corpus_reader.py


示例10: assemble_corpus

def assemble_corpus(corpus_reader: CorpusReader,
                    types_requested: List[str],
                    type_dirs: Dict[str, List[str]] = None,
                    type_files: Dict[str, List[str]] = None) -> CorpusReader:
    """
    Create a filtered corpus.

    A file id is kept when it is listed in ``type_files`` under a requested
    type, or when it starts with a directory listed in ``type_dirs`` under a
    requested type.  The selected file ids are sorted by name and stored on
    the reader as a tuple in ``_fileids``.

    :param corpus_reader: This gets mutated (its ``_fileids`` is replaced)
    :param types_requested: a list of string types, which are to be found in
        the type_dirs and type_files mappings
    :param type_dirs: a dict of corpus types to directories; a leading
        ``./`` is stripped and a trailing ``/`` is appended before matching
    :param type_files: a dict of corpus types to files
    :return: the same CorpusReader object, containing only the mappings
        desired; ``None`` if anything fails (including the case where no file
        id matches), in which case the failure is logged and the reader is
        left unmodified
    """
    try:
        all_file_ids = list(corpus_reader.fileids())
        # A set gives constant-time membership tests for the explicit file
        # lists instead of rescanning the whole list for every entry.
        known_file_ids = set(all_file_ids)
        clean_ids_types = []  # type: List[Tuple[str, str]]
        if type_files:
            for key, valuelist in type_files.items():
                # Entries filed under a falsy key are never selected.
                if key and key in types_requested:
                    clean_ids_types.extend(
                        (value, key) for value in valuelist
                        if value in known_file_ids)
        if type_dirs:
            for key, valuelist in type_dirs.items():
                if key not in types_requested:
                    continue
                for value in valuelist:
                    corrected_dir = '{}/'.format(value.replace('./', ''))
                    clean_ids_types.extend(
                        (name, key) for name in all_file_ids
                        if name and name.startswith(corrected_dir))
        if not clean_ids_types:
            # Fail explicitly rather than through an unpacking error; the
            # handler below logs it and returns None, as before.
            raise ValueError('no file ids matched the requested corpus types')
        clean_ids_types.sort(key=lambda x: x[0])
        corpus_reader._fileids = tuple(name for name, _ in clean_ids_types)
        return corpus_reader
    except Exception:
        # Deliberate best-effort: log the problem and signal it with None.
        LOG.exception('failure in corpus building')
        return None
开发者ID:diyclassics,项目名称:cltk,代码行数:39,代码来源:readers.py


示例11: __init__

    def __init__(self, root, fileids=None,
                 word_tokenizer=TweetTokenizer(),
                 encoding='utf8'):
        """
        Initialize the reader and verify that its corpus files are not empty.

        :param root: The root directory for this corpus.

        :param fileids: A list or regexp specifying the fileids in this corpus.

        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
        smaller units, including but not limited to words.  Stored as
        ``self._word_tokenizer``.

        :param encoding: Passed to the ``CorpusReader`` constructor.

        :raises ValueError: if a corpus file has size zero (paths that are
        ``ZipFilePathPointer`` instances are not checked).
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty.
        for path in self.abspaths(self._fileids):
            is_zip_pointer = isinstance(path, ZipFilePathPointer)
            if not is_zip_pointer and os.path.getsize(path) == 0:
                raise ValueError("File {} is empty".format(path))

        self._word_tokenizer = word_tokenizer
开发者ID:Weiming-Hu,项目名称:text-based-six-degree,代码行数:23,代码来源:twitter.py


示例12: __init__

    def __init__(self,
                 root,
                 fileids,
                 column_types=None,
                 top_node='S',
                 beginning_of_sentence=r'#BOS.+$',
                 end_of_sentence=r'#EOS.+$',
                 encoding=None):
        """ Construct a new corpus reader for reading NEGRA corpus files.
        @param root: The root directory of the corpus files.
        @param fileids: A list of or regex specifying the files to read from.
        @param column_types: An optional C{list} of columns in the corpus.
            Any value that is not a C{list} (including C{None}) is replaced
            by C{self.COLUMN_TYPES}.
        @param top_node: The top node of parsed sentence trees.
        @param beginning_of_sentence: A regex specifying the start of a sentence
        @param end_of_sentence: A regex specifying the end of a sentence
        @param encoding: The default corpus file encoding.
        @raise ValueError: If C{column_types} is a list containing an entry
            that is not in C{self.COLUMN_TYPES}.
        """

        # Make sure there are no invalid column types
        if isinstance(column_types, list):
            for column_type in column_types:
                if column_type not in self.COLUMN_TYPES:
                    # Wrap in a 1-tuple so a tuple-valued entry is formatted
                    # as one value instead of being unpacked by ``%``.
                    raise ValueError(
                        "Column %r is not supported." % (column_type,))
        else:
            column_types = self.COLUMN_TYPES

        # Define stuff
        self._top_node = top_node
        self._column_types = column_types
        self._fileids = fileids
        self._bos = beginning_of_sentence
        self._eos = end_of_sentence
        # Map each column name to its position in ``column_types``.
        self._colmap = {c: i for i, c in enumerate(column_types)}

        # Finish constructing by calling the extended class' constructor
        CorpusReader.__init__(self, root, fileids, encoding)
开发者ID:wroberts,项目名称:NLTK-Contributions,代码行数:36,代码来源:NegraCorpusReader.py


示例13: fileids

 def fileids(self, channels=None, domains=None, categories=None):
     """
     Return file ids, optionally restricted by exactly one filter.

     With no filter, the result of ``CorpusReader.fileids`` is returned.
     Otherwise the work is delegated to ``self._list_morph_files_by`` with
     the tag ``'channel'``, ``'domain'`` or ``'keyTerm'`` respectively; for
     categories, ``self._map_category`` is additionally passed as ``map``.

     :param channels: a single string or a collection of channel values
     :param domains: a single string or a collection of domain values
     :param categories: a single string or a collection of category values
     :raises ValueError: if more than one of the three filters is given
     """
     filters = (channels, domains, categories)
     given = sum(1 for value in filters if value is not None)
     # Any two filters together are rejected, not only all three, matching
     # what the error message states.
     if given > 1:
         raise ValueError('You can specify only one of channels, domains '
                          'and categories parameter at once')
     if given == 0:
         return CorpusReader.fileids(self)

     # ``basestring`` only exists on Python 2; fall back to ``str`` so the
     # single-string convenience also works on Python 3.
     try:
         string_types = basestring  # noqa: F821
     except NameError:
         string_types = str

     # Dispatch on ``is not None`` so that an explicitly empty filter is
     # passed to its own tag rather than falling through to the categories
     # branch with ``None``.
     if channels is not None:
         if isinstance(channels, string_types):
             channels = [channels]
         return self._list_morph_files_by('channel', channels)
     if domains is not None:
         if isinstance(domains, string_types):
             domains = [domains]
         return self._list_morph_files_by('domain', domains)
     if isinstance(categories, string_types):
         categories = [categories]
     return self._list_morph_files_by('keyTerm', categories,
             map=self._map_category)
开发者ID:B-Rich,项目名称:Fem-Coding-Challenge,代码行数:21,代码来源:ipipan.py


示例14: __init__

 def __init__(self, root, fileids):
     """
     Initialize the reader by delegating to ``CorpusReader.__init__``.

     ``root`` and ``fileids`` are forwarded unchanged, followed by two
     further positional arguments that are both ``None``.

     NOTE(review): the meaning of those two positional ``None`` values
     depends on the ``CorpusReader.__init__`` signature of the NLTK version
     in use -- verify before converting them to keyword arguments.
     """
     CorpusReader.__init__(self, root, fileids, None, None)
开发者ID:B-Rich,项目名称:Fem-Coding-Challenge,代码行数:2,代码来源:ipipan.py


示例15: __init__

 def __init__(self, root, fileids, wrap_etree=False):
     """
     Initialize the reader.

     :param root: forwarded to ``CorpusReader.__init__``
     :param fileids: forwarded to ``CorpusReader.__init__``
     :param wrap_etree: stored as ``self._wrap_etree`` before the base
         constructor runs
     """
     self._wrap_etree = wrap_etree
     CorpusReader.__init__(self, root, fileids)
开发者ID:NavinManaswi,项目名称:nltk,代码行数:3,代码来源:xmldocs.py



注:本文中的nltk.corpus.reader.api.CorpusReader类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python util.concat函数代码示例发布时间:2022-05-27
下一篇:
Python reader.ChunkedCorpusReader类代码示例发布时间:2022-05-27
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap