
Python pymystem3.Mystem Class Code Examples


This article collects typical usage examples of the pymystem3.Mystem class in Python. If you are wondering what the Mystem class does, how to use it, or what real code that uses it looks like, the curated examples below should help.



The following presents 20 code examples of the Mystem class, sorted by popularity by default.
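
Before the examples, here is a minimal sketch of the two Mystem calls they all rely on, analyze() and lemmatize(). The sample sentence is arbitrary; the output structure ('analysis', 'lex', 'gr') is the documented pymystem3 JSON format that the examples index into:

from pymystem3 import Mystem

m = Mystem()

# lemmatize() returns the text as a list of lemmas plus whitespace tokens
print(m.lemmatize(u"Красивая мама красиво мыла раму"))
# [u'красивый', u' ', u'мама', ...]

# analyze() returns one dict per token; real words carry an 'analysis'
# list whose entries hold 'lex' (the lemma) and 'gr' (grammatical tags)
for word in m.analyze(u"мама мыла раму"):
    for variant in word.get('analysis', []):
        print(variant['lex'], variant['gr'])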

Example 1: __init__

    # requires module-level: import re; import numpy as np; from pymystem3 import Mystem
    def __init__(self, path):

        self.text = open(path).read().lower()
        self.sentences = [sentence for sentence in re.split(r'(?:[.]\s*){3}|[.?!]', self.text) if len(sentence) > 1]
        self.pos_data = []

        m = Mystem()
        counter = [0, 0, 0, 0, 0]  # per-sentence counts, in order: [A, ADV, PR, S, V]

        for sentence in self.sentences:

            # parse with mystem
            # count adjectives A, nouns S, verbs V, adverbs ADV, pronouns PR
            data = m.analyze(sentence)
            for word in data:
                analysis = word.get('analysis', None)
                if analysis:
                    best = analysis[0]
                    gr = best['gr']
                    if 'S' in gr:
                        counter[3] += 1
                    elif 'ADV' in gr:
                        counter[1] += 1
                    elif 'A' in gr:
                        counter[0] += 1
                    elif 'V' in gr:
                        counter[4] += 1
                    elif 'PR' in gr:
                        counter[2] += 1

            self.pos_data.append(counter)
            counter = [0, 0, 0, 0, 0]

        self.data = np.array(self.pos_data)
Author: Sereni | Project: assignments | Lines: 34 | Source: genre_by_pos.py


Example 2: mystem_using_with_considering_of_multiple_letters

import json
import os
import re

from pymystem3 import Mystem


def mystem_using_with_considering_of_multiple_letters(input_directory, output_directory):
        input_files = filter(lambda x: not x.endswith('~'), os.listdir(input_directory))
        output_data = {}
        m = Mystem()
        # iterate over the documents
        for input_file in input_files:
            with open(input_directory + '/' + input_file) as data_file:
                data = json.load(data_file)
            list_of_terms = filter(lambda x: x != '', re.split(''' |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +''', data['text']))
            my_list_of_terms = []
            for term in list_of_terms:
                # if the term is already its own lemma, collapse runs of repeated letters
                if term == m.lemmatize(term)[0]:
                    my_term = term
                    term = u''
                    prev_letter = my_term[0]
                    term += my_term[0]
                    for i in range(1, len(my_term)):
                        if my_term[i] != prev_letter:
                            term += my_term[i]
                        prev_letter = my_term[i]
                    my_list_of_terms.append(term)
                else:
                    my_list_of_terms.append(term)
            list_of_terms = my_list_of_terms
            text = ' '.join(['%s' % term for term in list_of_terms])
            list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), m.lemmatize(text))
            text_of_output = ' '.join(['%s' % term for term in list_of_terms])
            output_data[input_file] = {}
            output_data[input_file]['id'] = data['id']
            output_data[input_file]['positive'] = data['positive']
            output_data[input_file]['sarcasm'] = data['sarcasm']
            output_data[input_file]['text'] = text_of_output
            with open(output_directory + '/' + input_file, 'w') as output_file:
                json.dump(output_data[input_file], output_file)
Author: pombredanne | Project: senty | Lines: 34 | Source: features.py


Example 3: without_pronouns

import json
import os

from pymystem3 import Mystem


def without_pronouns(directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(directory))
    output_data = {}
    m = Mystem()
    # iterate over the documents
    for input_file in input_files:
        with open(directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), data['text'].split(' '))
        my_list = list_of_terms
        list_of_terms = []
        for term in my_list:
            if m.analyze(term)[0].get(u'analysis'):
                if not m.analyze(term)[0][u'analysis'][0][u'gr'].startswith((u'SPRO', u'APRO')):
                    list_of_terms.append(term)
            else:
                list_of_terms.append(term)
        text_of_output = ' '.join(['%s' % term for term in list_of_terms])

        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output

        with open(directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
Author: pombredanne | Project: senty | Lines: 27 | Source: features.py


Example 4: __init__

    # requires module-level: import re; from collections import Counter;
    # from pymystem3 import Mystem; DEFAULTS is a dict of POS-tag keys defined elsewhere
    def __init__(self, path, doc_id, limit):
        """
        :param doc_id: numerical id of a document, pass manually
        """

        self.text = open(path).read().lower().replace('\n', '.')
        # need a better regex
        self.sentences = [sentence for sentence in re.split(r'(?:[.]\s*){3}|[.?!]', self.text) if sentence and len(sentence.split()) > 2]
        self.pos_data = []
        self.testing_data = []
        self.id = doc_id

        m = Mystem()
        counter = Counter(DEFAULTS)

        if not limit or limit > len(self.sentences):
            limit = len(self.sentences)

        for sentence in self.sentences[:limit]:

            # parse with mystem
            data = m.analyze(sentence)

            # get POS and count for each sentence
            pos = [word.get('analysis', None)[0]['gr'].split('(')[0].split(',')[0].split('=')[0]
                   for word in data if word.get('analysis', None)]
            counter.update(pos)

            # append to dataset
            self.pos_data.append([counter[key] for key in sorted(counter)])

            # reset counter
            counter = Counter(DEFAULTS)
Author: Sereni | Project: assignments | Lines: 33 | Source: ageeva_learning.py


Example 5: index

import re

from flask import render_template, request
from pymystem3 import Mystem


def index(name = None):
    if request.args:
        story = request.args['joke'] 
        mystem = Mystem()
        gramm = mystem.analyze(story)
        characters = set()
        # collect lemmas whose tags mark them animate ("од=") but not inanimate ("неод=")
        for i in gramm:
            if (str(i).find("од=") != -1) and (str(i).find("неод=") == -1):
                s1 = str(i)[str(i).find("'lex': '") + 8:]
                characters.add(s1[:s1.find("'")])
        
        file = open("corp.txt", 'r', encoding = "UTF-8")
        f = file.read()[1:].split('\n\n')
        file.close()
        
        file = open("ans.txt", 'w', encoding = "UTF-8")
        for i in f:
            words = ((re.sub('[,\.\?\!\—\-\(\)\:\;]', '', i)).lower()).split(' ')
            if characters <= set(words):
                file.write(i + '\n\n')
        file.close()
        with open("ans.txt", "r", encoding='utf-8') as f:
                content = f.read().split('\n\n')
        return render_template("index.html", content=content)        
    return render_template('index.html')
Author: polinadyakonova | Project: homeworks | Lines: 25 | Source: project.py


Example 6: extract

    # requires module-level: import json, math, os, re; from pymystem3 import Mystem
    def extract(self):
        try:
            # count how many files are in the directory
            input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory))
            output_data = {}
            list_of_all_terms = {}
            m = Mystem()
            # iterate over the documents
            for file in input_files:
                with open(self.input_directory + '/' + file) as data_file:
                    data = json.load(data_file)
                list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|<|>|\*|!|@|_ +""", data['text']))
                text = " ".join(["%s" % term for term in list_of_terms])
                list_of_terms = filter(lambda x: x != " ", m.lemmatize(text))
                count_of_rows = 0
                for i in range(0, len(list_of_terms)):
                    if list_of_terms[i] == '\n' or list_of_terms[i] == ' \n':
                        count_of_rows += 1
                    if list_of_terms[i] == ' \n':
                        list_of_terms[i] = '\n'
                if count_of_rows < self.threshold_of_rows_count:
                    first_list_of_terms = list_of_terms
                    list_of_terms = []
                    for i in range(0, len(first_list_of_terms)):
                        if first_list_of_terms[i] != '\n':
                            list_of_terms.append(first_list_of_terms[i])
                output_data[file] = {}
                output_data[file]['id'] = data['id']
                output_data[file]['positive'] = data['positive']
                output_data[file]['sarcasm'] = data['sarcasm']
                output_data[file]['terms'] = {}
                # collapse repeated words into per-document counts
                for term in list_of_terms:
                    if term not in output_data[file]['terms']:
                        output_data[file]['terms'][term] = 1
                    else:
                        output_data[file]['terms'][term] += 1
                for term in output_data[file]['terms'].keys():
                    if term not in list_of_all_terms:
                        list_of_all_terms[term] = 1
                    else:
                        list_of_all_terms[term] += 1
                    # compute tf
                    count_of_terms = output_data[file]['terms'][term]
                    output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0,
                                                        'count': count_of_terms}

            for file in input_files:
                # compute idf
                for term in output_data[file]['terms'].keys():
                    output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term])
                # write the result
                with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                    json.dump(output_data[file], output_file)
        except Exception:
            return False
        else:
            return True
Author: pombredanne | Project: senty | Lines: 58 | Source: standard_extractor_with_counting_number_of_rows.py
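
For reference, the tf and idf values stored above follow the standard definitions: tf is the term's count divided by the document length, and idf is log(N / df) over the collection. A minimal standalone restatement (the helper name tf_idf is ours, not from the source):

import math

def tf_idf(count_in_doc, doc_len, n_docs, docs_with_term):
    tf = float(count_in_doc) / doc_len              # share of the document's tokens
    idf = math.log(float(n_docs) / docs_with_term)  # rarity across the collection
    return tf * idf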


Example 7: lmtze

from lxml import etree
from pymystem3 import Mystem


def lmtze(textfile):
    m = Mystem()
    text = open(textfile, encoding='utf-8').readlines()
    newfile = open(textfile.replace('txt', 'lem.txt'), 'w', encoding='utf-8')
    result_full = []
    for line in text:
        try:
            element = etree.fromstring(line.strip('\n'))
            text_ = element.xpath('text()')
            entities = element.xpath('*')
            result = ['<sent>']
            while text_:
                l = text_.pop(0)
                # open('temp.txt', 'w', encoding='utf-8').write(l)
                # subprocess.call(['C:\\Mystem\\mystem', 'i'])
                l = m.analyze(l)
                # print(l)
                for x in l:
                    if x.get('analysis') is not None:
                        if x.get('analysis') == []:
                            result.append(x['text'])
                        else:
                            result.append(x['analysis'][0]['lex'] + '_' + x['analysis'][0]['gr'].split(',')[0].split('=')[0])
                    else:
                        continue

                if text_:
                    e = entities.pop(0)
                    e_ = m.analyze(e.text)
                    result.append('<' + e.tag + '>')
                    for x in e_:
                        if x.get('analysis') is not None:
                            if x.get('analysis') == []:
                                result.append(x['text'])
                            else:
                                result.append(x['analysis'][0]['lex'])
                        else:
                            continue
                    result.append('</' + e.tag + '>')
        except Exception:
            continue
        result.append('</sent>')
        result_full.append(result)
        result = []
        print(len(result_full), ' sentences parsed')
    for sent in result_full:
        prev = ''
        for x in sent:
            if '<' in x and '/' not in x:
                newfile.write(prev + x)
                prev = ''
            elif '_' in x or x.isalpha():
                newfile.write(prev + x)
                prev = ' '
            else:
                newfile.write(x)
        newfile.write('\n')
Author: mannefedov | Project: Relext | Lines: 57 | Source: lmtze.py


Example 8: extract

    # requires module-level: import json, math, os, re; from pymystem3 import Mystem
    def extract(self):
        try:
            # count how many files are in the directory
            input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory))
            output_data = {}
            list_of_all_terms = {}
            m = Mystem()
            # iterate over the documents
            for file in input_files:
                with open(self.input_directory + '/' + file) as data_file:
                    data = json.load(data_file)
                list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text']))
                text = " ".join(["%s" % term for term in list_of_terms])
                list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))
                my_list = list_of_terms
                list_of_terms = []
                for term in my_list:
                    if m.analyze(term)[0].get(u'analysis'):
                        if not m.analyze(term)[0][u'analysis'][0][u'gr'].startswith(self.service_parts_of_speech) and len(term) > 1:
                            list_of_terms.append(term)
                        if term == u'не':
                            list_of_terms.append(term)
                    else:
                        list_of_terms.append(term)
                output_data[file] = {}
                output_data[file]['id'] = data['id']
                output_data[file]['positive'] = data['positive']
                output_data[file]['sarcasm'] = data['sarcasm']
                output_data[file]['terms'] = {}
                # collapse repeated words into per-document counts
                for term in list_of_terms:
                    if term not in output_data[file]['terms']:
                        output_data[file]['terms'][term] = 1
                    else:
                        output_data[file]['terms'][term] += 1
                for term in output_data[file]['terms'].keys():
                    if term not in list_of_all_terms:
                        list_of_all_terms[term] = 1
                    else:
                        list_of_all_terms[term] += 1
                    # compute tf
                    count_of_terms = output_data[file]['terms'][term]
                    output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0,
                                                        'count': count_of_terms}

            for file in input_files:
                # compute idf
                for term in output_data[file]['terms'].keys():
                    output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term])
                # write the result
                with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                    json.dump(output_data[file], output_file)
        except Exception:
            return False
        else:
            return True
Author: pombredanne | Project: senty | Lines: 56 | Source: standard_extractor_with_mystem_without_service_parts_of_speech.py


Example 9: extract

    # requires module-level: import json, math, os, re; from pymystem3 import Mystem
    def extract(self):
        try:
            # count how many files are in the directory
            input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory))
            output_data = {}
            list_of_all_n_grams = {}
            m = Mystem()
            # iterate over the documents
            for file in input_files:
                with open(self.input_directory + '/' + file) as data_file:
                    data = json.load(data_file)
                list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text']))
                text = " ".join(["%s" % term for term in list_of_terms])
                list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))
                list_of_n_grams_tuples = {}
                for j in range(0, self.n):
                    list_of_n_grams_tuples[j] = zip(*[list_of_terms[i:] for i in range(j + 1)])
                list_of_n_grams_strings = []
                for j in range(0, self.n):
                    for gram_tuple in list_of_n_grams_tuples[j]:
                        string_of_n_gram = " ".join(["%s" % term for term in gram_tuple])
                        list_of_n_grams_strings.append(string_of_n_gram)
                output_data[file] = {}
                output_data[file]['id'] = data['id']
                output_data[file]['positive'] = data['positive']
                output_data[file]['sarcasm'] = data['sarcasm']
                output_data[file]['terms'] = {}
                # collapse repeated n-grams into per-document counts
                for gram in list_of_n_grams_strings:
                    if gram not in output_data[file]['terms']:
                        output_data[file]['terms'][gram] = 1
                    else:
                        output_data[file]['terms'][gram] += 1
                for gram in output_data[file]['terms'].keys():
                    if gram not in list_of_all_n_grams:
                        list_of_all_n_grams[gram] = 1
                    else:
                        list_of_all_n_grams[gram] += 1
                    # compute tf
                    count_of_n_grams = output_data[file]['terms'][gram]
                    output_data[file]['terms'][gram] = {'tf': float(count_of_n_grams)/len(list_of_n_grams_strings), 'idf': 0,
                                                        'count': float(count_of_n_grams)}
            for file in input_files:
                # compute idf
                for gram in output_data[file]['terms'].keys():
                    output_data[file]['terms'][gram]['idf'] = math.log(float(len(input_files))/list_of_all_n_grams[gram])
                # write the result
                with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                    json.dump(output_data[file], output_file)
        except Exception:
            return False
        else:
            return True
Author: pombredanne | Project: senty | Lines: 53 | Source: more_than_n_gram_extractor_with_mystem.py


Example 10: with_not

import json
import os

from pymystem3 import Mystem


def with_not(directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(directory))
    output_data = {}
    m = Mystem()
    # iterate over the documents
    for input_file in input_files:
        with open(directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), data['text'].split(' '))


        # merge "не" + (word), skipping over intensifiers like "очень"
        nums_of_bigrams = []
        helping_words = [u'совсем', u'очень', u'слишком', u'самый']
        for i in range(0, len(list_of_terms) - 1):
            if list_of_terms[i] == u'не' and list_of_terms[i+1] not in helping_words:
                if m.analyze(list_of_terms[i+1])[0].get(u'analysis'):
                    if not m.analyze(list_of_terms[i+1])[0][u'analysis'][0][u'gr'].startswith(u'S,'):
                        nums_of_bigrams.append((i, i+1))
            elif list_of_terms[i] == u'не' and list_of_terms[i+1] in helping_words and i + 2 < len(list_of_terms):
                if m.analyze(list_of_terms[i+2])[0].get(u'analysis'):
                    if not m.analyze(list_of_terms[i+2])[0][u'analysis'][0][u'gr'].startswith(u'S,'):
                        nums_of_bigrams.append((i, i+2))
        for i in range(0, len(nums_of_bigrams)):
            if nums_of_bigrams[i][0] + 1 == nums_of_bigrams[i][1]:
                list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + list_of_terms[nums_of_bigrams[i][1]]
                list_of_terms[nums_of_bigrams[i][1]] = ''
            elif nums_of_bigrams[i][0] + 2 == nums_of_bigrams[i][1]:
                list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + list_of_terms[nums_of_bigrams[i][1]]
                list_of_terms[nums_of_bigrams[i][1] - 1] = ''
                list_of_terms[nums_of_bigrams[i][1]] = ''
        list_of_terms = filter(lambda x: x != '', list_of_terms)


        text_of_output = ' '.join(['%s' % term for term in list_of_terms])

        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output

        with open(directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
Author: pombredanne | Project: senty | Lines: 44 | Source: features.py
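
To make the merge concrete, a toy run (the token list is invented for illustration):

# input:  [u'фильм', u'не', u'очень', u'хороший']
# "не" precedes the intensifier u'очень', so it fuses with the word two
# positions ahead and the intensifier is dropped:
# output: [u'фильм', u'нехороший']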


Example 11: Runner

from pymystem3 import Mystem

# note: Lid and is_cyrillic are defined elsewhere in the source file
class Runner(object):
    def __init__(self, input_text):
        self.lemmatize = None
        while True:
            response = raw_input("Do you want to lemmatize text first? (yes/no)\n").lower()
            if response == "yes":
                print "You should wait for a while"
                self.lemmatize = True
                self.stemmer = Mystem()
                break
            elif response == "no":
                self.lemmatize = False
                break

        self.word_lists = list()
        with open(input_text, "r") as f:
            for line in f:
                line += "."
                if self.lemmatize:
                    lexemes = self.stemmer.lemmatize(line)
                    word_list = list()  # list of words not separated by punctuation
                    for lexeme in lexemes:
                        lexeme = lexeme.strip()
                        if lexeme:
                            if lexeme.translate(None, '.,?!:;()"\' -\t\n'):  # check that the lexeme is not a punctuation mark
                                lexeme = lexeme.decode("utf-8")
                                if is_cyrillic(lexeme):
                                    word_list.append(lexeme)
                            else:  # otherwise, add the bigrams from this list and start a new empty one
                                self.word_lists.append(word_list)
                                word_list = list()
                else:
                    line = line.replace(".", " . ").replace(",", " , ").replace(":", " : ").replace(";", " ; ")\
                        .replace("?", " ? ").replace("!", " ! ").replace("(", " ( ").replace(")", " ) ")\
                        .replace("--", " -- ").replace(".", " . ")
                    word_list = list()
                    for lexeme in line.split():
                        # check that the lexeme is not a punctuation mark
                        lexeme = lexeme.translate(None, '.,?!:;()"\'').replace("--", "").decode("utf-8").strip().lower()
                        if lexeme:
                            if is_cyrillic(lexeme):
                                word_list.append(lexeme)
                        else:
                            if word_list:
                                self.word_lists.append(word_list)
                            word_list = list()

        train, test = self.split()
        self.lid = Lid(train, test)
        self.lid.run()

    def split(self):
        n = len(self.word_lists)
        train = self.word_lists[:n*9/10]
        test = self.word_lists[n*9/10:]
        return train, test
Author: ialibekov | Project: InfoSearch | Lines: 56 | Source: hw_5.py


Example 12: mystem_using

import json
import os
import re

from pymystem3 import Mystem


def mystem_using(input_directory, output_directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(input_directory))
    output_data = {}
    m = Mystem()
    for input_file in input_files:
        with open(input_directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        list_of_terms = filter(lambda x: x != '', re.split(''' |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +''', data['text']))
        text = " ".join(["%s" % term for term in list_of_terms])
        list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), m.lemmatize(text))
        text_of_output = ' '.join(['%s' % term for term in list_of_terms])
        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output

        with open(output_directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
Author: pombredanne | Project: senty | Lines: 19 | Source: features.py


Example 13: Index

from pymystem3 import Mystem


class Index(object):

    def __init__(self, input_file):
        self.stemmer = Mystem()
        self.documents = dict()
        self.tokens = list()
        self.terms = dict()
        self.index = list()

        # reading documents, making tokenization
        with open(input_file, "r") as f:
            for i, line in enumerate(f, start=1):
                self.documents[i] = line.decode("utf-8")
                for word in self.stemmer.lemmatize(line):
                    token = word.translate(None, '.,?!:;()"\'-').decode("utf-8").strip()
                    if token:
                        self.tokens.append((token, i))

        # sorting by tokens first, then by frequency
        self.tokens.sort(key=lambda tup: (tup[0], tup[1]))

        # terminization and building index
        current_term = self.tokens[0][0]
        current_doc_id = self.tokens[0][1]
        doc_ids = [current_doc_id]
        for token, doc_id in self.tokens:
            term = token.lower()
            if term == current_term:
                if doc_id != current_doc_id:
                    doc_ids.append(doc_id)
                    current_doc_id = doc_id
            else:
                self.terms[current_term] = (len(doc_ids), doc_ids)
                self.index.append((current_term, len(doc_ids), doc_ids))
                current_term = term
                current_doc_id = doc_id
                doc_ids = [doc_id]
        self.terms[current_term] = (len(doc_ids), doc_ids)
        self.index.append((current_term, len(doc_ids), doc_ids))

    def print_to_file(self):
        with open("result.txt", "w") as f:
            for term, count, doc_ids in self.index:
                f.write("{},\t{},\t{}\n".format(term.encode("utf-8"), count, doc_ids))

    def print_statistics(self):
        terms_num = len(self.terms)
        terms_len = 0.
        for term in self.terms:
            terms_len += len(term)

        print "***********************"
        print "Number of terms = {}".format(terms_num)
        print "Average term length = {}".format(terms_len / terms_num)
        print "***********************"
Author: ialibekov | Project: InfoSearch | Lines: 55 | Source: index.py
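
A brief usage sketch for the Index class above (the file name is invented; the constructor reads one document per line):

idx = Index("documents.txt")  # the inverted index is built at construction time
idx.print_to_file()           # writes result.txt: term, document frequency, posting list
idx.print_statistics()        # prints the number of terms and the average term length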


Example 14: search

import codecs
import re

import lxml.etree
from flask import Markup, render_template, request
from pymystem3 import Mystem


def search():
    cn = None
    file = codecs.open('static/articles.xml', 'r', 'utf-8')
    rfile = file.read()
    tree = lxml.etree.fromstring(rfile)
    res = tree.xpath('entry')
    categ = {
        'cat': 'Категория', 'wgroup': 'Группа слов с близким значением', 'comm': 'Комментарии',
        'stdiff': 'Стилистические различия', 'overlap': 'Совпадающая часть значения',
        'dom': 'Доминанта группы', 'diffmark': 'Различительные признаки, релевантные для данной группы',
        'diff': 'Смысловые различия', 'rare': 'Редкие слова, примыкающие к группе',
        'anmean': 'Другие значения слов, входящих в группу', 'comb': 'Сочетаемость', 'reg': 'Региональные варианты',
        'adict': 'Данные академических словарей', 'doc': 'Нормативные документы',
        'etim': 'Этимология', 'ill': 'Иллюстрации'
    }
    file.close()
    ms = Mystem()
    wordsearch = ms.lemmatize(request.form['search'].lower())[0]

    for i in res:
        if wordsearch == '':
            cn = 'Пустой запрос'
        elif i.text.lower().startswith(wordsearch):
            arr = []
            for j in i.iter():
                for k in dict.keys(categ):
                    if j.tag == k:
                        if j.text != 'null':
                            arr.append('<font size="4"><b>' + str(categ[j.tag]) + '</b></font><br>' + str(j.text))
                text = '<br><br>'.join([j for j in arr[1:]])
                text = re.sub('\*', '<b>', text)
                text = re.sub('\#', '</b>', text)
                text = re.sub('\$', '<i>', text)
                text = re.sub('\%', '</i>', text)
                text = re.sub('\@', '<font color="#696969">', text)
                text = re.sub('\+', '</font>', text)
                cn = '<strong><big>' + i.text + '</big></strong><br><br>' + re.sub('\n', '<br>', text)
            break
        else:
            cn = 'По Вашему запросу ничего не найдено. <br>' \
                 'Попробуйте использовать "Поиск по тегу" или измените запрос.'
    return render_template('search.html', cn=Markup(cn))
Author: piskunova | Project: everydayobjectsdictionary | Lines: 42 | Source: dictionary.py


Example 15: build_pos

    # requires module-level: from collections import Counter; from pymystem3 import Mystem;
    # DEFAULTS is a dict of POS-tag keys defined elsewhere
    def build_pos(self):

        m = Mystem()
        counter = Counter(DEFAULTS)

        for doc in self.documents:

            # parse with mystem
            data = m.analyze(doc.text)

            # get POS and count for each sentence
            pos = [word.get('analysis', None)[0]['gr'].split('(')[0].split(',')[0].split('=')[0]
                   for word in data if word.get('analysis', None)]
            counter.update(pos)

            # append to dataset
            self.pos_data.append([counter[key] for key in sorted(counter)])

            # reset counter
            counter = Counter(DEFAULTS)
Author: Sereni | Project: assignments | Lines: 20 | Source: learning_news.py


Example 16: produce_lemmas

from pymystem3 import Mystem


# note: prepare_content() is defined elsewhere in the source file
def produce_lemmas(connection, tableName, outputTableName):
    mystem = Mystem()
    cursor = connection.cursor()
    inserter = connection.cursor()

    query = 'DELETE FROM `%s`' % outputTableName
    inserter.execute(query)
    connection.commit()

    query = 'SELECT * FROM `%s`' % tableName
    cursor.execute(query)
    query = 'INSERT INTO `' + outputTableName + '` (`' + tableName + '_id`, `word_class_id`, `lex`, `gr`)' \
            'SELECT %i, `id`, "%s", "%s" FROM `word_classes` WHERE `abbr`="%s"'
    for id, concept, scheme in cursor:
        lemmas = mystem.analyze(concept)
        for lemma in lemmas:
            for analysis in lemma.get('analysis', []):
                inserter.execute(query % prepare_content(id, analysis))
    connection.commit()

    cursor.close()
Author: Brinit | Project: nlp | Lines: 21 | Source: lemma.py


Example 17: fill_mystem

import re

# note: get_sentences() is defined elsewhere in the source file
def fill_mystem():
    from pymystem3 import Mystem
    m = Mystem()
    for sentence in get_sentences(1):
        lemmas = m.analyze(sentence.source)
        items = list()
        for lemma in lemmas:
            text = lemma['text']
            analysis = lemma.get('analysis')
            if not analysis:
                text = text.strip()
                if not len(text):
                    print 'spaces = "%s"' % text
                    continue
                if ' ' in text:
                    for item in re.split('\s+', text):
                        items.append("%s   %s ?" % (item, item))
                    print 'several =', "|".join(re.split('\s+', text))
                    continue
                print 'delimiter = "%s"' % text
                items.append("%s   %s ?" % (text, text))
                continue

            if not len(text.strip()):
                raise Exception('Impossible')
            if ' ' in text:
                raise Exception('Impossible')

            lexemes = list()
            for lexeme in analysis:
                print 'lex=', lexeme.get('lex', '-')
                print 'gr=', lexeme.get('gr', '-')
                lexemes.append("%s %s" % (lexeme['lex'], lexeme['gr']))
            items.append("%s   %s" % (text, '  '.join(lexemes)))
        sentence.mystem = '\n'.join(items)
        sentence.save()
Author: 2vitalik | Project: collocations | Lines: 36 | Source: other.py


Example 18: __init__

    # requires module-level: from pymystem3 import Mystem
    # (this is the same Index constructor already shown in Example 13)
    def __init__(self, input_file):
        self.stemmer = Mystem()
        self.documents = dict()
        self.tokens = list()
        self.terms = dict()
        self.index = list()

        # reading documents, making tokenization
        with open(input_file, "r") as f:
            for i, line in enumerate(f, start=1):
                self.documents[i] = line.decode("utf-8")
                for word in self.stemmer.lemmatize(line):
                    token = word.translate(None, '.,?!:;()"\'-').decode("utf-8").strip()
                    if token:
                        self.tokens.append((token, i))

        # sorting by tokens first, then by frequency
        self.tokens.sort(key=lambda tup: (tup[0], tup[1]))

        # terminization and building index
        current_term = self.tokens[0][0]
        current_doc_id = self.tokens[0][1]
        doc_ids = [current_doc_id]
        for token, doc_id in self.tokens:
            term = token.lower()
            if term == current_term:
                if doc_id != current_doc_id:
                    doc_ids.append(doc_id)
                    current_doc_id = doc_id
            else:
                self.terms[current_term] = (len(doc_ids), doc_ids)
                self.index.append((current_term, len(doc_ids), doc_ids))
                current_term = term
                current_doc_id = doc_id
                doc_ids = [doc_id]
        self.terms[current_term] = (len(doc_ids), doc_ids)
        self.index.append((current_term, len(doc_ids), doc_ids))
Author: ialibekov | Project: InfoSearch | Lines: 37 | Source: index.py


Example 19: Mystem

import os, json, dicttoxml
from pymystem3 import Mystem

m = Mystem()
top = 'C:\\Users\\John\\Desktop\\py_files\\питон\\korpus\\no_marks'
for root, dirs, files in os.walk(top):
    for name in files:
        loc = os.path.join(root, name)
        loc_list = loc.split('\\')  #creates list in order to remove path content
        new_root = loc.replace('\\no_marks\\{0}\\{1}\\{2}'.format(loc_list[8], loc_list[9], loc_list[10]), '') #removes path ending
        dir_marks = os.path.join(new_root + '\\marks\\{0}\\{1}'.format(loc_list[8], loc_list[9]))   #adds new path ending for json.docs
        dir_xml = os.path.join(new_root + '\\xml\\{0}\\{1}'.format(loc_list[8], loc_list[9]))       #adds new path ending for xml docs
        new_name = name.replace('.txt', '')
        if not os.path.exists(dir_marks):   #makes necessary dirs if not present
            os.makedirs(dir_marks)
        if not os.path.exists(dir_xml):
            os.makedirs(dir_xml)
        with open(loc, "r", encoding = 'utf-8') as doc:
            text_doc = doc.read()
        analysis = m.analyze(text_doc)
        info = json.dumps(analysis, ensure_ascii = False)  #json string with grammatical and lemma info
        with open("{0}\\{1}.json".format(dir_marks, new_name), 'w', encoding = 'utf-8') as doc_marks:
            doc_marks.write(info)
        xml = dicttoxml.dicttoxml(analysis).decode('utf-8')     #converts the analysis to xml (not the JSON string)
        with open("{0}\\{1}.xml".format(dir_xml, new_name), 'w', encoding = 'utf-8') as doc_xml:
            doc_xml.write(xml)


Author: lylax47 | Project: Homework-and-Stuffs | Lines: 26 | Source: Stemmer.py


Example 20: poehali

def poehali(csv_input):
	'''
	Main function.
	csv_input -- CSV file with the table of links.
	Output layout:
	|-xmlFile/
	|---------year/
	|--------------month/
	=========
	|-plain/
	|-------year/
	|------------month/
	=========
	|-html/
	|------year/
	|-----------month/
	|csv_file.csv

	'''
	data = []
	i = 0
	m = Mystem()
	gusina()  # helper defined elsewhere in the source
	col = ["path", "author", "sex", "birthday", "header", "created", "sphere", "genre_fi", "type", "topic", "chronotop", "style", "audience_age", "audience_level", "audience_size", "source", "publication", "publisher", "publ_year", "medium", "country", "region", "language"]
	time.sleep(3)

	path = os.getcwd()
	path = path + "/"
	csv_file = open(path + "csv_file.csv", "w")
	writer = csv.writer(csv_file,delimiter = ",")
	write 
