• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python mparser.ProfileParser类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中mparser.ProfileParser的典型用法代码示例。如果您正苦于以下问题:Python ProfileParser类的具体用法?Python ProfileParser怎么用?Python ProfileParser使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了ProfileParser类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: profile_handler

def profile_handler(doc, name, url, path):
    """Build an Employee from a profile page whose body is ``div.xinwen-txt_3``.

    Args:
        doc: HTML markup handed to BeautifulSoup.
        name: Employee name; also used as the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        The bare Employee (name and url only) when the content div is
        missing; otherwise whatever ``ProfileParser.parse()`` returns
        (presumably the populated Employee -- confirm in mparser).
    """
    employee = Employee(name=name, url=url)
    html_path = os.path.join(path, name + ".html")

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    container = soup.find(name="div", attrs={"class": "xinwen-txt_3"})
    if container is None:
        # Only the name and homepage are kept when the page has no content div.
        return employee

    # Save the profile fragment next to the other output, once per employee.
    if not os.path.exists(html_path):
        with open(html_path, 'wb') as out:
            out.write(container.prettify())

    # Parse the fragment as plain text lines.
    parser = ProfileParser(
        lines=container.stripped_strings,
        employee=employee,
        set_attr_hook=set_attr_hook,
        ignore=set(['fax']),
    )
    return parser.parse()
开发者ID:Jumbo-WJB,项目名称:EduParser,代码行数:25,代码来源:MyHandler.py


示例2: profile_handler

def profile_handler(doc, name, url, path):
    """Build an Employee from a profile page wrapped in ``div.content-wrapper``.

    The wrapper div (or the whole document when it is absent) is saved to
    ``<path>/<name>.html`` if that file does not exist yet. Parsing is then
    narrowed to the ``div#column-1`` child when one is present.

    Args:
        doc: HTML markup handed to BeautifulSoup.
        name: Employee name; also used as the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        Whatever ``ProfileParser.parse()`` returns (presumably the populated
        Employee -- confirm in mparser).
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    # Fix: the keyword was misspelled ``clasS_``. bs4 turns unknown keywords
    # into attribute filters, so it looked for an attribute literally named
    # "clasS_" and the wrapper was never found.
    divs = soup.find_all(name="div", class_="content-wrapper", limit=1)
    div = divs[0] if divs else soup

    # Save the profile fragment once; later runs keep the existing file.
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(div.prettify())

    # Prefer the first info column when the page has one.
    infos_div = div.find_all('div', attrs={"id": "column-1"})
    if infos_div:
        div = infos_div[0]

    # Parse the selected fragment as plain text lines.
    parser = ProfileParser(
        lines=div.stripped_strings,
        employee=employee,
        set_attr_hook=set_attr_hook,
        max_line=256,
    )
    return parser.parse()
开发者ID:Jumbo-WJB,项目名称:EduParser,代码行数:27,代码来源:MyHandler.py


示例3: profile_handler

def profile_handler(doc, name, url, path):
    """Build an Employee from a profile page whose body is ``div.NewsArticles``.

    Falls back to the whole document when that div is missing. The chosen
    fragment is written to ``<path>/<name>.html`` unless the file exists.

    Args:
        doc: HTML markup handed to BeautifulSoup.
        name: Employee name; also used as the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        Whatever ``ProfileParser.parse()`` returns (presumably the populated
        Employee -- confirm in mparser).
    """
    target = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    matches = soup.find_all(name="div", attrs={"class": "NewsArticles"}, limit=1)
    root = matches[0] if matches else soup

    # Save the profile fragment once; an existing file is left untouched.
    if not os.path.exists(target):
        with open(target, "wb") as out:
            out.write(root.prettify())

    # Parse the fragment as plain text lines.
    parser = ProfileParser(
        lines=root.stripped_strings,
        employee=employee,
        set_attr_hook=set_attr_hook,
        max_line=999,
        force_email=True,
        force_tel=False,
    )
    return parser.parse()
开发者ID:yixiaoyang,项目名称:EduParser,代码行数:25,代码来源:MyHandler.py


示例4: profile_handler

def profile_handler(doc, name, url, path):
    """Build an Employee from a profile page whose body is ``div.box_rt01.list``.

    Falls back to the whole document when that div is missing. The chosen
    fragment is always (re)written to ``<path>/<name>.html``. The first
    ``<h3>`` is used as the employee title when it contains any entry of
    ``PROFILE_TITLES``.

    Args:
        doc: HTML markup handed to BeautifulSoup.
        name: Employee name; also used as the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        Whatever ``ProfileParser.parse()`` returns (presumably the populated
        Employee -- confirm in mparser).
    """
    out_path = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    found = soup.find_all(name="div", class_="box_rt01 list", limit=1)
    div = found[0] if found else soup

    # Unlike the "write once" handlers, this one overwrites the file each run.
    with open(out_path, 'wb') as out:
        out.write(div.prettify())

    headings = div.find_all('h3')
    if not headings:
        print("not found h3")
    else:
        # Collapse all whitespace before matching against the known titles.
        title = ''.join(headings[0].get_text().split())
        print(title)
        if any(known in title for known in PROFILE_TITLES):
            employee.title = title
            print("got => " + title)

    # Parse the fragment as plain text lines.
    parser = ProfileParser(lines=div.stripped_strings, employee=employee, force_email=True)
    return parser.parse()
开发者ID:Jumbo-WJB,项目名称:EduParser,代码行数:34,代码来源:MyHandler.py


示例5: profile_handler

def profile_handler(doc, name, url, path):
    """Build an Employee, following the link in the page's fifth ``<li>``.

    When the page has at least five ``<li>`` elements and the fifth one
    contains an anchor, that anchor's target is fetched and its ``div#main``
    (or the whole fetched document if that div is missing) is parsed.
    In every other case the original document is parsed.

    Args:
        doc: HTML markup handed to BeautifulSoup.
        name: Employee name; also used as the base name of the saved file.
        url: Profile URL stored on the Employee and used as the base for
            resolving the detail link.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        Whatever ``ProfileParser.parse()`` returns (presumably the populated
        Employee -- confirm in mparser).
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)

    # Fix: default to the whole document so ``div`` is always bound. It used
    # to be left undefined when the fifth <li> had no anchor (NameError).
    div = soup

    lis = soup.find_all(name="li")
    # Fix: the old guard ``not lis and len(lis) != 5`` let lists of 1-4 items
    # through to ``lis[4]`` (IndexError). Require the fifth item to exist.
    if len(lis) >= 5:
        ass = lis[4].find_all('a')
        if ass:
            li_url = ass[0]['href']
            newUrl = urljoin(url, li_url)
            newDoc = get_doc_byUrllib2(newUrl)
            soup = BeautifulSoup(newDoc, Config.SOUP_PARSER)
            mainDiv = soup.find_all('div', attrs={"id": "main"})
            if mainDiv:
                div = mainDiv[0]
            else:
                print("not found main div")
                div = soup

    # Save the profile fragment once; an existing file is left untouched.
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(div.prettify())

    # Parse the fragment as plain text lines.
    parser = ProfileParser(
        lines=div.stripped_strings,
        employee=employee,
        set_attr_hook=set_attr_hook,
        force_email=True,
    )
    return parser.parse()
开发者ID:Jumbo-WJB,项目名称:EduParser,代码行数:35,代码来源:MyHandler.py


示例6: profile_handler

def profile_handler(doc, name, url, path):
    """Build an Employee from a profile page whose body is ``div.right-nr``.

    Args:
        doc: HTML markup handed to BeautifulSoup.
        name: Employee name; also used as the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        The bare Employee (name and url only) when the content div is
        missing; otherwise whatever ``ProfileParser.parse()`` returns
        (presumably the populated Employee -- confirm in mparser).
    """
    employee = Employee(name=name, url=url)
    html_path = os.path.join(path, name + ".html")

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    content = soup.find(name="div", attrs={"class": "right-nr"})
    if content is None:
        print("div class=right-nr not found")
        return employee

    # Save the profile fragment once; an existing file is left untouched.
    if not os.path.exists(html_path):
        with open(html_path, 'wb') as out:
            out.write(content.prettify())

    # Parse the fragment as plain text lines (no set_attr_hook for this site).
    return ProfileParser(lines=content.stripped_strings, employee=employee).parse()
开发者ID:Jumbo-WJB,项目名称:EduParser,代码行数:25,代码来源:MyHandler.py


示例7: profile_handler

def profile_handler(doc, name, url, path):
    """Build an Employee from a page using one of two known content divs.

    The class selectors are tried in order. The first one that matches
    supplies the fragment; if neither matches the whole document is used.
    The fragment is written to ``<path>/<name>.html`` unless the file exists.

    Args:
        doc: HTML markup handed to BeautifulSoup.
        name: Employee name; also used as the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        Whatever ``ProfileParser.parse()`` returns (presumably the populated
        Employee -- confirm in mparser).
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)

    # NOTE(review): the second selector ends with a space. With bs4's
    # multi-valued class handling that exact string may never match --
    # confirm against a real page before relying on this fallback.
    div = soup
    for css_class in ("line20 dataName", "rightArea clearfix "):
        hits = soup.find_all(name="div", class_=css_class, limit=1)
        if hits:
            div = hits[0]
            break

    # Save the profile fragment once; an existing file is left untouched.
    if not os.path.exists(filename):
        with open(filename, 'wb') as out:
            out.write(div.prettify())

    # Parse the fragment as plain text lines.
    parser = ProfileParser(
        lines=div.stripped_strings,
        employee=employee,
        set_attr_hook=set_attr_hook,
        max_line=999,
    )
    return parser.parse()
开发者ID:Jumbo-WJB,项目名称:EduParser,代码行数:28,代码来源:MyHandler.py


示例8: profile_handler

def profile_handler(doc, name, url, path):
    """Build an Employee from a table-based profile page.

    The first ``<td valign="center">`` (or the whole document when absent)
    is saved to ``<path>/<name>.html`` unless the file exists. Each nested
    ``<td>`` whose stripped text is shorter than 128 characters becomes one
    whitespace-free line for the parser. When there are no nested cells the
    fragment's stripped strings are used.

    Args:
        doc: HTML markup handed to BeautifulSoup.
        name: Employee name; also used as the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        Whatever ``ProfileParser.parse()`` returns (presumably the populated
        Employee -- confirm in mparser).
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    outer = soup.find_all(name="td", attrs={"valign": "center"}, limit=1)
    div = outer[0] if outer else soup

    # Save the profile fragment once; an existing file is left untouched.
    if not os.path.exists(filename):
        with open(filename, 'wb') as out:
            out.write(div.prettify())

    cells = div.find_all('td')
    if not cells:
        lines = div.stripped_strings
        print("TDS none!")
    else:
        lines = []
        for cell in cells:
            text = cell.get_text().strip()
            # Long cells are skipped.
            if len(text) >= 128:
                continue
            text = ''.join(text.split())
            print(text)
            lines.append(text)

    parser = ProfileParser(
        lines=lines,
        employee=employee,
        set_attr_hook=profile_set_attr_hook,
        max_line=256,
    )
    return parser.parse()
开发者ID:Jumbo-WJB,项目名称:EduParser,代码行数:34,代码来源:MyHandler.py


示例9: profile_handler

def profile_handler(doc, name, url, path):
    """Build an Employee from a page whose content sits in a known ``<td>``.

    Looks for ``td.bd-content`` first, then for ``<td width="79%">``. When
    neither exists, the raw ``doc`` is dumped to ``<path>/<name>.html`` and
    the bare Employee is returned. Otherwise the matched cell is written to
    that file (overwriting it) and parsed.

    Args:
        doc: HTML markup handed to BeautifulSoup; written verbatim to disk
            when no content cell is found.
        name: Employee name; also used as the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        The bare Employee when no content cell is found; otherwise whatever
        ``ProfileParser.parse()`` returns (presumably the populated
        Employee -- confirm in mparser).
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # The source pages are messy: keep name/homepage and save the profile
    # markup to a separate file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    cells = (
        soup.find_all(name="td", class_="bd-content", limit=1)
        or soup.find_all(name="td", attrs={"width": "79%"}, limit=1)
    )
    if not cells:
        with open(filename, "wb") as out:
            out.write(doc)
        return employee

    cell = cells[0]
    with open(filename, "wb") as out:
        out.write(cell.prettify())

    # Parse the cell as plain text lines.
    return ProfileParser(lines=cell.stripped_strings, employee=employee).parse()
开发者ID:yixiaoyang,项目名称:EduParser,代码行数:27,代码来源:MyHandler.py


示例10: profile_handler

def profile_handler(doc, name, url, path):
    """Build an Employee from a profile page whose body is ``div#right_2``.

    When the div contains at least four matching white top-aligned
    ``td.ft12`` cells, the third and fourth are joined with ``;`` and stored
    as ``employee.research``. The parser is told to ignore ``research`` in
    every case.

    Args:
        doc: HTML markup handed to BeautifulSoup.
        name: Employee name; also used as the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        The bare Employee (name and url only) when the content div is
        missing; otherwise whatever ``ProfileParser.parse()`` returns
        (presumably the populated Employee -- confirm in mparser).
    """
    employee = Employee(name=name, url=url)
    html_path = os.path.join(path, name + ".html")

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    panel = soup.find(name="div", attrs={"id": "right_2"})
    if panel is None:
        return employee

    # Save the profile fragment once; an existing file is left untouched.
    if not os.path.exists(html_path):
        with open(html_path, 'wb') as out:
            out.write(panel.prettify())

    cell_filter = {"bgcolor": "#FFFFFF", "class": "ft12", "valign": "top"}
    cells = panel.find_all(name="td", attrs=cell_filter, limit=4)
    if len(cells) == 4:
        first_part = cells[2].get_text().strip()
        second_part = cells[3].get_text().strip()
        employee.research = first_part + ";" + second_part
        print("research:" + employee.research)

    # Parse the fragment as plain text lines.
    parser = ProfileParser(
        lines=panel.stripped_strings,
        employee=employee,
        ignore=set(['research']),
    )
    return parser.parse()
开发者ID:Jumbo-WJB,项目名称:EduParser,代码行数:29,代码来源:MyHandler.py


示例11: profile_handler

def profile_handler(doc, name, url, path):
    """Build an Employee from an ASP.NET news-style profile page.

    ``div.newsContent`` is the fragment saved to ``<path>/<name>.html``
    (unless the file exists). The text that is parsed comes from the span
    with id ``ctl00_ContentPlaceHolder1_NewsView1_lbl_NewsContent``, looked
    up in the whole document.

    Args:
        doc: HTML markup handed to BeautifulSoup.
        name: Employee name; also used as the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        The bare Employee (name and url only) when either element is
        missing; otherwise whatever ``ProfileParser.parse()`` returns
        (presumably the populated Employee -- confirm in mparser).
    """
    employee = Employee(name=name, url=url)
    html_path = os.path.join(path, name + ".html")

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    content_div = soup.find(name="div", attrs={"class": "newsContent"})
    if content_div is None:
        return employee

    # Save the profile fragment once; an existing file is left untouched.
    if not os.path.exists(html_path):
        with open(html_path, 'wb') as out:
            out.write(content_div.prettify())

    detail_span = soup.find(
        name="span",
        attrs={"id": "ctl00_ContentPlaceHolder1_NewsView1_lbl_NewsContent"},
    )
    if detail_span is None:
        return employee

    # Parse the span as plain text lines.
    parser = ProfileParser(
        lines=detail_span.stripped_strings,
        employee=employee,
        set_attr_hook=set_attr_hook,
    )
    return parser.parse()
开发者ID:Jumbo-WJB,项目名称:EduParser,代码行数:25,代码来源:MyHandler.py


示例12: handler

def handler(tag):
    """Build an Employee from one list-entry tag.

    The name is the whitespace-free text of the first ``div.teacher-title``
    inside ``tag``, when there is one. All stripped strings of ``tag`` are
    then fed to ProfileParser.

    Args:
        tag: A BeautifulSoup element describing one teacher entry.

    Returns:
        Whatever ``ProfileParser.parse()`` returns (presumably the populated
        Employee -- confirm in mparser).
    """
    employee = Employee()

    title_div = tag.find("div", class_="teacher-title")
    if title_div is not None:
        employee.name = ''.join(title_div.get_text().split())

    # Parse the entry as plain text lines.
    return ProfileParser(lines=tag.stripped_strings, employee=employee).parse()
开发者ID:yixiaoyang,项目名称:pyScripts,代码行数:12,代码来源:MyHandler.py


示例13: profile_handler

def profile_handler(doc, name, url, path):
    """Build an Employee from a page with an 11-cell summary table.

    ``div.page_right.addpage_right`` (or the whole document when absent) is
    saved to ``<path>/<name>.html`` unless the file exists. If that fragment
    holds exactly 11 ``<td>`` cells, cells 2, 4, 8 and 10 fill the
    employee's departments, title, email and research (whitespace removed;
    empty values skipped). Free text for the parser comes from
    ``div.text_more`` when present, otherwise from the fragment itself.

    Args:
        doc: HTML markup handed to BeautifulSoup.
        name: Employee name; also used as the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        Whatever ``ProfileParser.parse()`` returns (presumably the populated
        Employee -- confirm in mparser).
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    wrappers = soup.find_all(name="div", attrs={"class": "page_right addpage_right"}, limit=1)
    div = wrappers[0] if wrappers else soup

    # Save the profile fragment once; an existing file is left untouched.
    if not os.path.exists(filename):
        with open(filename, 'wb') as out:
            out.write(div.prettify())

    cells = div.find_all('td')
    if len(cells) == 11:
        # (cell index, Employee attribute) pairs of the fixed table layout.
        layout = ((2, 'departments'), (4, 'title'), (8, 'email'), (10, 'research'))
        for index, attr in layout:
            value = cells[index].get_text()
            if not value:
                continue
            value = ''.join(value.split())
            if value:
                setattr(employee, attr, value)

    more = soup.find_all(name="div", attrs={"class": "text_more"}, limit=1)
    if more:
        div = more[0]

    # Parse the selected fragment as plain text lines.
    parser = ProfileParser(
        lines=div.stripped_strings,
        employee=employee,
        set_attr_hook=set_attr_hook,
    )
    return parser.parse()
开发者ID:Jumbo-WJB,项目名称:EduParser,代码行数:51,代码来源:MyHandler.py


示例14: handler

def handler(tag):
    """Build an Employee from one list-entry tag.

    The first ``a.orangea`` supplies the name (whitespace removed) and the
    profile link. The first ``a.black01``, if any, is parsed for further
    attributes.

    Args:
        tag: A BeautifulSoup element describing one teacher entry.

    Returns:
        The Employee as built here when there is no ``a.black01``; otherwise
        whatever ``ProfileParser.parse()`` returns (presumably the populated
        Employee -- confirm in mparser).
    """
    employee = Employee()

    name_links = tag.find_all('a', class_="orangea")
    if name_links:
        link = name_links[0]
        employee.name = ''.join(link.get_text().split())
        employee.profile = link['href']

    detail_links = tag.find_all('a', class_="black01")
    if not detail_links:
        return employee

    parser = ProfileParser(lines=detail_links[0].stripped_strings, employee=employee)
    return parser.parse()
开发者ID:Jumbo-WJB,项目名称:EduParser,代码行数:14,代码来源:MyHandler.py


示例15: profile_handler

def profile_handler(doc, name, url, path):
    """Build an Employee from the ``<span>`` texts of a profile page.

    ``div#maincontent`` (or the whole document when absent) is saved to
    ``<path>/<name>.html`` unless the file exists. Within it, the first
    element with class ``other`` is searched for ``<span>`` elements; when
    no such element exists the whole document is searched. Each non-empty
    span text, with whitespace removed, becomes one parser line.

    Args:
        doc: HTML markup handed to BeautifulSoup.
        name: Employee name; also used as the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        The bare Employee (name and url only) when no span text is found;
        otherwise whatever ``ProfileParser.parse()`` returns (presumably the
        populated Employee -- confirm in mparser).
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "maincontent"}, limit=1)
    div = divs[0] if divs else soup

    # Save the profile fragment once; an existing file is left untouched.
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(div.prettify())

    divs = div.find_all(class_="other")
    div = divs[0] if divs else soup

    # One whitespace-free line per non-empty <span>.
    texts = (''.join(span.get_text().split()) for span in div.find_all('span'))
    lines = [text for text in texts if text]
    if not lines:
        # Fix: this used to be ``return emplo``, an undefined name that
        # raised NameError whenever the page had no usable span text.
        return employee

    parser = ProfileParser(lines=lines, employee=employee, set_attr_hook=set_attr_hook)
    return parser.parse()
开发者ID:Jumbo-WJB,项目名称:EduParser,代码行数:50,代码来源:MyHandler.py


示例16: profile_handler

def profile_handler(doc, name, url, path):
    """Build an Employee by parsing the entire profile page.

    The prettified document is always (re)written to ``<path>/<name>.html``.

    Args:
        doc: HTML markup handed to BeautifulSoup.
        name: Employee name; also used as the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        Whatever ``ProfileParser.parse()`` returns (presumably the populated
        Employee -- confirm in mparser).
    """
    html_path = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)

    # No site-specific container: save and parse the whole document.
    with open(html_path, 'wb') as out:
        out.write(soup.prettify())

    return ProfileParser(lines=soup.stripped_strings, employee=employee).parse()
开发者ID:yixiaoyang,项目名称:pyScripts,代码行数:17,代码来源:MyHandler.py


示例17: profile_handler

def profile_handler(doc, name, url, path):
    """Build an Employee from a page that renders e-mail and phone as images.

    ``div#view_pannel`` (or the whole document when absent) is saved to
    ``<path>/<name>.html`` unless the file exists. For each ``div.item_list``
    inside it, an entry mentioning e-mail or telephone is resolved through
    ``image2text(imageSrc(...))`` (presumably OCR on the linked image --
    confirm in the helper). Fields resolved that way are put on the parser's
    ignore list so the text pass does not touch them.

    Args:
        doc: HTML markup handed to BeautifulSoup.
        name: Employee name; also the base name of the saved HTML and image
            files.
        url: Profile URL stored on the Employee.
        path: Directory for ``<name>.html``, ``<name>_email.png`` and
            ``<name>_tel.png``.

    Returns:
        Whatever ``ProfileParser.parse()`` returns (presumably the populated
        Employee -- confirm in mparser).
    """
    filename = os.path.join(path, name + ".html")
    email_image_filename = os.path.join(path, name + "_email.png")
    tel_image_filename = os.path.join(path, name + "_tel.png")

    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "view_pannel"}, limit=1)
    div = divs[0] if divs else soup

    # Save the profile fragment once; an existing file is left untouched.
    if not os.path.exists(filename):
        with open(filename, "wb") as fp:
            fp.write(div.prettify())

    item_divs = div.find_all(name="div", attrs={"class": "item_list"})

    ignores = []
    # Fix: the loop variable used to be named ``div`` as well. It overwrote
    # the panel, so the parser below only saw the last item instead of the
    # whole panel.
    for item in item_divs:
        string = item.get_text()
        if not string:
            continue
        if u"邮件" in string and len(employee.email) == 0:
            employee.email = image2text(imageSrc(item), email_image_filename, "eng2")
            print(employee.email)
            ignores.append("email")
        elif u"电话" in string and len(employee.tel) == 0:
            employee.tel = image2text(imageSrc(item), tel_image_filename, "eng")
            print(employee.tel)
            ignores.append("tel")

    # Parse the whole panel as plain text lines.
    parser = ProfileParser(
        lines=div.stripped_strings,
        employee=employee,
        set_attr_hook=set_attr_hook,
        max_line=256,
        ignore=set(ignores),
    )
    return parser.parse()
开发者ID:yixiaoyang,项目名称:EduParser,代码行数:43,代码来源:MyHandler.py


示例18: handler

def handler(tag):
    """Build an Employee from one list-entry tag.

    When the entry has an ``a.dt_text_tit`` anchor, its string is the name
    and its ``href`` is both profile and url. Otherwise the first stripped
    string of the entry is taken as the name.

    Args:
        tag: A BeautifulSoup element describing one teacher entry.

    Returns:
        Whatever ``ProfileParser.parse()`` returns (presumably the populated
        Employee -- confirm in mparser).
    """
    employee = Employee()
    lines = tag.stripped_strings

    anchors = tag.find_all(name="a", attrs={"class": "dt_text_tit"})
    if anchors:
        link = anchors[0]
        employee.name = link.string
        employee.profile = link["href"]
        employee.url = employee.profile
    else:
        # The first line is the name. If ``lines`` is a one-shot generator
        # (as bs4's stripped_strings is), the parser below only receives
        # the remaining lines.
        first_line = next(iter(lines), None)
        if first_line is not None:
            employee.name = first_line

    return ProfileParser(lines=lines, employee=employee).parse()
开发者ID:yixiaoyang,项目名称:EduParser,代码行数:19,代码来源:MyHandler.py


示例19: handler

def handler(tag):
    """Build an Employee from a list entry whose name carries a JS click handler.

    The first element with class ``handle`` holds the name and an ``onclick``
    of the form ``toCardDetailAction('<card id>');``. The card id is cut out
    of that attribute and appended to the fixed detail URL.

    Args:
        tag: A BeautifulSoup element describing one teacher entry.

    Returns:
        None when the entry has no ``handle`` element; otherwise whatever
        ``ProfileParser.parse()`` returns (presumably the populated
        Employee -- confirm in mparser).
    """
    handles = tag.find_all(class_="handle")
    if not handles:
        return None
    name_span = handles[0]

    employee = Employee()
    employee.name = ''.join(name_span.get_text().split())

    # Drop the call prefix and the trailing three characters of the onclick
    # value, leaving only the id.
    call_prefix = 'toCardDetailAction(\''
    card_id = name_span['onclick'][len(call_prefix):-3]
    employee.url = 'http://scse.buaa.edu.cn/buaa-css-web/toCardDetailAction.action?firstSelId=CARD_TMPL_OF_FIRST_NAVI_CN%20&%20secondSelId=CARD_TMPL_OF_ALL_TEACHER_CN%20&cardId=' + card_id
    print("card_id=[%s]" % card_id)

    return ProfileParser(lines=tag.stripped_strings, employee=employee).parse()
开发者ID:Jumbo-WJB,项目名称:EduParser,代码行数:20,代码来源:MyHandler.py


示例20: profile_handler

def profile_handler(doc, name, url, path):
    """Build an Employee from the fourth ``<table>`` of a profile page.

    The fourth table is always (re)written to ``<path>/<name>.html``. The
    personal details are read from its first
    ``<td valign="top" width="577">`` cell.

    Args:
        doc: HTML markup handed to BeautifulSoup.
        name: Employee name; also used as the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        The bare Employee (name and url only) when the page has fewer than
        four tables or the detail cell is missing; otherwise whatever
        ``ProfileParser.parse()`` returns (presumably the populated
        Employee -- confirm in mparser).
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    # The source pages are messy: keep name/homepage and save the profile
    # markup to a separate file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    tables = soup.find_all(name="table", limit=4)
    # Fix: the guard used to be ``< 2`` although index 3 is read below, so
    # pages with two or three tables raised IndexError.
    if len(tables) < 4:
        return employee

    table_content = tables[3]
    with open(filename, 'wb') as fp:
        fp.write(table_content.prettify())

    td = table_content.find_all("td", attrs={"valign": "top", "width": "577"})
    if not td:
        return employee

    # Extract the person's details from the cell's text lines.
    parser = ProfileParser(lines=td[0].stripped_strings, employee=employee)
    return parser.parse()
开发者ID:Jumbo-WJB,项目名称:EduParser,代码行数:24,代码来源:MyHandler.py



注:本文中的mparser.ProfileParser类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python mpd.MPDClient类代码示例发布时间:2022-05-27
下一篇:
Python mp3play.load函数代码示例发布时间:2022-05-27
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap