本文整理汇总了Golang中golang.org/x/net/html.NewTokenizer函数的典型用法代码示例。如果您正苦于以下问题:Golang NewTokenizer函数的具体用法?Golang NewTokenizer怎么用?Golang NewTokenizer使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了NewTokenizer函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Golang代码示例。
示例1: ProcessHTML
// ProcessHTML tokenizes the HTML read from buffer and hands the attributes of
// every meta tag that carries attributes to og.ProcessMeta.
//
// Scanning stops at the first body tag token (start, self-closing or end),
// because OpenGraph data only lives in the head. Reaching the end of the input
// is not an error; any other tokenizer error is returned unchanged.
func (og *OpenGraph) ProcessHTML(buffer io.Reader) error {
	z := html.NewTokenizer(buffer)
	for {
		switch z.Next() {
		case html.ErrorToken:
			if err := z.Err(); err != io.EOF {
				return err
			}
			return nil
		case html.StartTagToken, html.SelfClosingTagToken, html.EndTagToken:
			name, hasAttr := z.TagName()
			tag := atom.Lookup(name)
			if tag == atom.Body {
				return nil // OpenGraph is only in head, so the body is not needed
			}
			if tag != atom.Meta || !hasAttr {
				continue
			}
			// Collect all attributes of this meta tag into one map.
			m := make(map[string]string)
			for hasAttr {
				var key, val []byte
				key, val, hasAttr = z.TagAttr()
				m[atom.String(key)] = string(val)
			}
			og.ProcessMeta(m)
		}
	}
}
开发者ID:dyatlov,项目名称:go-opengraph,代码行数:30,代码来源:opengraph.go
示例2: htmlToText
// htmlToText extracts the text content of the HTML read from r.
//
// Every non-blank text token is whitespace-trimmed and written on its own
// line. Text that appears while a tag listed in ignoretag is open is skipped,
// and comments are dropped. The text collected so far is returned as soon as
// the tokenizer yields an error token, which includes the normal end of input.
func htmlToText(r io.Reader) []byte {
	t := html.NewTokenizer(r)
	var out bytes.Buffer
	// depth counts how many ignored elements are currently open.
	var depth int
	for {
		switch t.Next() {
		case html.ErrorToken:
			return out.Bytes()
		case html.StartTagToken:
			if _, ok := ignoretag[t.Token().Data]; ok {
				depth++
			}
		case html.EndTagToken:
			// Never drop below zero: a stray closing tag would otherwise leave
			// the counter negative and silence all text that follows it.
			if _, ok := ignoretag[t.Token().Data]; ok && depth > 0 {
				depth--
			}
		case html.TextToken:
			if depth != 0 {
				continue
			}
			if text := strings.TrimSpace(t.Token().Data); text != "" {
				fmt.Fprintln(&out, text)
			}
		}
	}
}
开发者ID:husio,项目名称:apps,代码行数:30,代码来源:scrap.go
示例3: rewriteHTML
// rewriteHTML copies the HTML from reader to writer, passing the value of each
// url-valued attribute through urlRewriter on the way. Which attributes count
// as url-valued is looked up per tag in the atomsToAttrs table.
//
// Start and self-closing tags are re-serialized from the parsed token; every
// other token is written out as its raw bytes. An io.EOF from the tokenizer or
// from the writer ends the copy with a nil error; any other error is returned.
func rewriteHTML(reader io.Reader, writer io.Writer, urlRewriter func(string) string) error {
	// Note: the content is assumed to be UTF-8.
	z := html.NewTokenizer(reader)
	for {
		var err error
		switch z.Next() {
		case html.ErrorToken:
			err = z.Err()
		case html.StartTagToken, html.SelfClosingTagToken:
			tag := z.Token()
			if rewritable, found := atomsToAttrs[tag.DataAtom]; found {
				for idx := range tag.Attr {
					if a := &tag.Attr[idx]; rewritable.Has(a.Key) {
						a.Val = urlRewriter(a.Val)
					}
				}
			}
			_, err = writer.Write([]byte(tag.String()))
		default:
			_, err = writer.Write(z.Raw())
		}
		switch err {
		case nil:
			continue
		case io.EOF:
			return nil
		default:
			return err
		}
	}
}
开发者ID:johndmulhausen,项目名称:kubernetes,代码行数:32,代码来源:transport.go
示例4: isHTML
// isHTML reports whether the tokenizer finds a start tag named "html" within
// the first 1024 bytes of content. Empty content is never treated as HTML.
func isHTML(content []byte) bool {
	const sniffLen = 1024
	if len(content) == 0 {
		return false
	}
	if len(content) > sniffLen {
		content = content[:sniffLen]
	}
	z := html.NewTokenizer(bytes.NewReader(content))
	for {
		switch z.Next() {
		case html.ErrorToken:
			// Ran out of input (or hit an error) without seeing the tag.
			return false
		case html.StartTagToken:
			if name, _ := z.TagName(); string(name) == "html" {
				return true
			}
		}
	}
}
开发者ID:ReanGD,项目名称:go-web-search,代码行数:26,代码来源:body_parser.go
示例5: scrape
// scrape walks the tokens of the HTML read from r until it meets a </head> end
// tag or the tokenizer yields an error token.
//
// NOTE(review): this looks like an unfinished stub. The <tr> branch does
// nothing yet, and the bytes collected at </head> go into a local buffer that
// is never read afterwards.
func scrape(r io.Reader) {
	tokenizer := html.NewTokenizer(r)
	var pending bytes.Buffer
	for {
		kind := tokenizer.Next()
		current := tokenizer.Token()
		switch {
		case kind == html.ErrorToken:
			// Kept for things like tracking pixels that are served with the
			// HTML content type, so that such input does not break the code.
			return
		case kind == html.StartTagToken && current.DataAtom == atom.Tr:
			// TODO (original author): check for the correct class attr and
			// then switch to html.NewTokenizerFragment.
		case kind == html.EndTagToken && current.String() == "</head>":
			// End of the head: keep whatever the tokenizer still has buffered.
			pending.Write(tokenizer.Buffered())
			return
		}
	}
}
开发者ID:bentranter,项目名称:bookstore,代码行数:31,代码来源:main.go
示例6: Autodiscover
// Autodiscover scans the HTML document b for a feed link and returns its href.
//
// A link qualifies when it is a <link> start or self-closing tag whose rel is
// "alternate", whose type is "application/rss+xml" or "application/atom+xml",
// and whose href is non-empty. ErrNoRssLink is returned when tokenizing stops,
// at end of input or on a tokenizer error, before such a link is found.
func Autodiscover(b []byte) (string, error) {
	z := html.NewTokenizer(bytes.NewReader(b))
	for z.Next() != html.ErrorToken {
		tok := z.Token()
		if tok.DataAtom != atom.Link {
			continue
		}
		if tok.Type != html.StartTagToken && tok.Type != html.SelfClosingTagToken {
			continue
		}
		// When an attribute is repeated, the last occurrence wins.
		var rel, href, mime string
		for _, a := range tok.Attr {
			switch a.Key {
			case "rel":
				rel = a.Val
			case "href":
				href = a.Val
			case "type":
				mime = a.Val
			}
		}
		if rel != "alternate" || href == "" {
			continue
		}
		if mime == "application/rss+xml" || mime == "application/atom+xml" {
			return href, nil
		}
	}
	return "", ErrNoRssLink
}
开发者ID:kissthink,项目名称:goread,代码行数:28,代码来源:autodiscover.go
示例7: ParseHtml
// ParseHtml streams the HTML from r through the tokenizer and returns a
// rewritten copy of the document.
//
//   - Start and self-closing tags are emitted by flushTagToken, which also
//     receives url.
//   - A non-blank text token that directly follows a start tag for which
//     flushTagToken returned "style" is trimmed and replaced by the result of
//     ParseCss.
//   - All other text, doctype, comment and end-tag tokens are copied verbatim.
//
// The rewritten document is returned when the input is exhausted. Any other
// tokenizer error is returned together with a nil slice.
func ParseHtml(r io.Reader, url string) ([]byte, error) {
	z := html.NewTokenizer(r)
	var newHtml []byte
	// lastTag holds what flushTagToken returned for the immediately preceding
	// start tag (presumably the tag name; verify against flushTagToken) and is
	// cleared by any other token.
	lastTag := ""
	for {
		tt := z.Next()
		raw := z.Raw()
		switch tt {
		case html.ErrorToken:
			// Compare against the sentinel value rather than its message text.
			if err := z.Err(); err != io.EOF {
				return nil, err
			}
			return newHtml, nil
		case html.TextToken:
			text := strings.TrimSpace(string(raw))
			if text != "" && lastTag == "style" {
				newCss := ParseCss(text, url)
				newHtml = append(newHtml, []byte(newCss)...)
			} else {
				newHtml = append(newHtml, raw...)
			}
		case html.DoctypeToken, html.CommentToken, html.EndTagToken:
			newHtml = append(newHtml, raw...)
		case html.StartTagToken:
			lastTag = flushTagToken(&newHtml, z, url)
		case html.SelfClosingTagToken:
			flushTagToken(&newHtml, z, url)
		}
		if tt != html.StartTagToken {
			lastTag = ""
		}
	}
}
开发者ID:gongshw,项目名称:lighthouse,代码行数:35,代码来源:html.go
示例8: GetPriceForBestBuy
// GetPriceForBestBuy downloads the page at url and returns the price that
// parseCurrency extracts from the text token directly following a <meta> start
// tag whose id attribute contains "schemaorg-offer-price".
//
// 0.0 is returned when the page cannot be fetched or when the tokenizer stops
// before such a price is found.
func GetPriceForBestBuy(url string) float64 {
	resp, err := http.Get(url)
	if err != nil {
		// Report a failed request like "no price found" instead of
		// terminating the whole process from inside a helper.
		log.Printf("fetching %q: %v", url, err)
		return 0.0
	}
	defer resp.Body.Close()
	z := html.NewTokenizer(resp.Body)
	for {
		switch z.Next() {
		case html.ErrorToken:
			return 0.0
		case html.StartTagToken:
			t := z.Token()
			if t.Data != "meta" {
				continue
			}
			for _, attr := range t.Attr {
				if attr.Key == "id" && strings.Contains(attr.Val, "schemaorg-offer-price") {
					// The price is expected in the token right after the tag.
					if z.Next() == html.TextToken {
						return parseCurrency(z.Token().Data)
					}
				}
			}
		}
	}
}
开发者ID:vinaygaba,项目名称:pricetell,代码行数:30,代码来源:bestbuy.go
示例9: getLinks
// getLinks downloads the page at u and returns every URL that getURL extracts
// from its start and self-closing tags, in document order.
//
// If the page cannot be fetched, the failure is logged and nil is returned.
// Tokenizing stops at the first error token, which includes end of input.
func getLinks(u *url.URL) []*url.URL {
	resp, err := http.Get(u.String())
	if err != nil {
		logs.Log(fmt.Sprintf("Couldn't crawl %s", u))
		// resp is nil when Get fails, so it must not be touched below.
		return nil
	}
	defer resp.Body.Close()
	links := make([]*url.URL, 0)
	tokenizer := html.NewTokenizer(resp.Body)
	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			return links
		case html.StartTagToken, html.SelfClosingTagToken:
			if link, ok := getURL(u, tokenizer.Token()); ok {
				links = append(links, link)
			}
		}
	}
}
开发者ID:fueledbymarvin,项目名称:gocardless,代码行数:25,代码来源:crawler.go
示例10: parseTitle
// parseTitle returns the text of the first text token that directly follows a
// <title> start tag in the HTML read from resp.
//
// Only the first 8192 bytes of resp are examined. fallback is returned when
// the tokenizer stops without producing a title, and the literal string
// "Failed to parse title" when it stops right after a <title> start tag.
func parseTitle(resp io.Reader, fallback string) string {
	h := html.NewTokenizer(io.LimitReader(resp, 8192))
	for {
		switch h.Next() {
		case html.ErrorToken:
			return fallback
		case html.StartTagToken:
			if tag, _ := h.TagName(); string(tag) != "title" {
				continue
			}
			// Look at the token right after <title>. Anything other than text
			// or an error (e.g. an empty title) keeps the scan going.
			switch h.Next() {
			case html.ErrorToken:
				return "Failed to parse title"
			case html.TextToken:
				return h.Token().Data
			}
		}
	}
}
开发者ID:velour,项目名称:holdmypage,代码行数:28,代码来源:main.go
示例11: obtainCsrf
// obtainCsrf scans the HTML read from b for a CSRF token and stores it in
// w.csrf.
//
// The token is taken from a self-closing <meta> tag: if one of its first two
// attributes is name="csrf-token", the value of the other one is used. When a
// <div> start tag whose first attribute is id="flash_alert" is encountered,
// the serialized token that follows it is turned into an error. The whole
// input is always consumed; the last such error, or nil, is returned at the
// end.
func (w *WebClient) obtainCsrf(b io.Reader) error {
	var errorMessage error
	z := html.NewTokenizer(b)
	for {
		switch z.Next() {
		case html.ErrorToken:
			return errorMessage
		case html.SelfClosingTagToken:
			t := z.Token()
			// Both of the first two attributes are read below, so a meta tag
			// with fewer than two attributes must be skipped.
			if t.Data != "meta" || len(t.Attr) < 2 {
				continue
			}
			first, second := t.Attr[0], t.Attr[1]
			switch {
			case second.Key == "name" && second.Val == "csrf-token":
				w.csrf = first.Val
				log.Debugf("Csrf Token: %s", w.csrf)
			case first.Key == "name" && first.Val == "csrf-token":
				w.csrf = second.Val
				log.Debugf("Csrf Token: %s", w.csrf)
			}
		case html.StartTagToken:
			t := z.Token()
			if t.Data == "div" && len(t.Attr) > 0 && t.Attr[0].Key == "id" && t.Attr[0].Val == "flash_alert" {
				// The alert message is the token right after the div tag.
				z.Next()
				errorMessage = errors.New(z.Token().String())
			}
		}
	}
}
开发者ID:odacremolbap,项目名称:concerto,代码行数:31,代码来源:setup.go
示例12: Crawl
// Crawl fetches the page at url_prefix+url and calls getQ for every <div>
// start tag that isSummary accepts.
//
// "END!" is sent on ch exactly once when the crawl is over: after the
// tokenizer yields an error token (which includes end of page), or right away
// when the page cannot be fetched.
func Crawl(url string, ch chan string) {
	resp, err := http.Get(url_prefix + url)
	if err != nil {
		// resp is nil here. Still signal completion so that the receiver is
		// not left waiting for this crawler.
		ch <- "END!"
		return
	}
	defer resp.Body.Close()
	tokenizer := html.NewTokenizer(resp.Body)
	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			// End of page
			ch <- "END!"
			return
		case html.StartTagToken:
			if tag := tokenizer.Token(); tag.Data == "div" && isSummary(tag) {
				// NOTE(review): getQ receives the tokenizer by value, so it
				// works on a copy of the tokenizer state — confirm that this
				// is intended.
				getQ(*tokenizer, ch)
			}
		}
	}
}
开发者ID:carol-hsu,项目名称:go-study,代码行数:26,代码来源:multiple-web-crawlers.go
示例13: avanza_get_sellprice
// avanza_get_sellprice reads the current sell price out of an Avanza page.
//
// It scans resp.Body for a <span> start tag that has an attribute whose value
// contains "sellPrice", serializes the token that follows it, swaps every ","
// for "." and parses the result as a float64. A parse error is deliberately
// not reported: whatever value strconv.ParseFloat yields is returned. 0.0 is
// returned when the tokenizer stops before a matching span is found.
func (this *Parse) avanza_get_sellprice(resp *http.Response) float64 {
	z := html.NewTokenizer(resp.Body)
	for {
		kind := z.Next()
		if kind == html.ErrorToken {
			return 0.0
		}
		if kind != html.StartTagToken {
			continue
		}
		tag := z.Token()
		if tag.Data != "span" {
			continue
		}
		for _, a := range tag.Attr {
			if !strings.Contains(a.Val, "sellPrice") {
				continue
			}
			// The price is carried by the token right after the span tag.
			z.Next()
			raw := z.Token().String()
			price, _ := strconv.ParseFloat(strings.Replace(raw, ",", ".", -1), 64)
			return price
		}
	}
}
开发者ID:Balzzanar,项目名称:golang,代码行数:32,代码来源:parse.go
示例14: GetPriceForWalmart
// GetPriceForWalmart downloads the page at url and returns the price that
// parseJson extracts from the text token directly following a <script> start
// tag whose id attribute contains "tb-djs-wml-base".
//
// 0.0 is returned when the page cannot be fetched or when the tokenizer stops
// before such a price is found.
func GetPriceForWalmart(url string) float64 {
	resp, err := http.Get(url)
	if err != nil {
		// Report a failed request like "no price found" instead of
		// terminating the whole process from inside a helper.
		log.Printf("fetching %q: %v", url, err)
		return 0.0
	}
	defer resp.Body.Close()
	z := html.NewTokenizer(resp.Body)
	for {
		switch z.Next() {
		case html.ErrorToken:
			return 0.0
		case html.StartTagToken:
			t := z.Token()
			if t.Data != "script" {
				continue
			}
			for _, attr := range t.Attr {
				if attr.Key == "id" && strings.Contains(attr.Val, "tb-djs-wml-base") {
					// The JSON payload is expected in the token right after
					// the script tag.
					if z.Next() == html.TextToken {
						return parseJson(z.Token().Data)
					}
				}
			}
		}
	}
}
开发者ID:vinaygaba,项目名称:pricetell,代码行数:29,代码来源:walmart.go
示例15: scrapePageWorker
// scrapePageWorker tokenizes the HTML behind *page and reports anchors on out.
//
// For every <a> start tag it walks the attributes in order: an "id" attribute
// is remembered in slot 0 of the pair, and each "data-href" attribute is put
// into slot 1 and triggers a send of the pair on out. The worker stops at the
// first error token, and true is always sent on chFinished when it returns.
func scrapePageWorker(page *io.ReadCloser, out chan [2]string, chFinished chan bool) {
	defer func() {
		chFinished <- true
	}()
	z := html.NewTokenizer(*page)
	for {
		switch z.Next() {
		case html.ErrorToken:
			return
		case html.StartTagToken:
			anchor := z.Token()
			if anchor.Data != "a" {
				continue
			}
			// A fresh pair per anchor: slot 0 is the id, slot 1 the data-href.
			var pair [2]string
			for _, a := range anchor.Attr {
				switch a.Key {
				case "id":
					pair[0] = a.Val
				case "data-href":
					pair[1] = a.Val
					out <- pair
				}
			}
		}
	}
}
开发者ID:ohrodr,项目名称:2kcookies,代码行数:34,代码来源:2kcookies.go
示例16: getHTMLContent
// getHTMLContent returns the first text token found after a start tag named
// tag in the HTML read from r.
//
// A matching end tag switches the search off again until the next matching
// start tag. When the tokenizer stops first, an empty result is returned with
// the tokenizer's error (io.EOF at end of input).
func getHTMLContent(r io.Reader, tag []byte) (result string, err error) {
	z := html.NewTokenizer(r)
	// inside is true after a matching start tag and false after a matching
	// end tag.
	inside := false
	for {
		switch kind := z.Next(); kind {
		case html.ErrorToken:
			return "", z.Err()
		case html.TextToken:
			if inside {
				return string(z.Text()), nil
			}
		case html.StartTagToken, html.EndTagToken:
			if name, _ := z.TagName(); bytes.Equal(name, tag) {
				inside = kind == html.StartTagToken
			}
		}
	}
}
开发者ID:pedronasser,项目名称:caddy-search,代码行数:28,代码来源:pipeline.go
示例17: findAllLinks
// findAllLinks collects the child links and the title of a page from its HTTP
// response body.
//
// The title is the data of the token that follows a <title> start tag without
// attributes (React also emits <title> tags, but those carry special
// attributes). A link is recorded for each <a> start tag from which
// extractLink yields an href that ShouldCrawl accepts; it is stored in the
// form produced by unifyURL. Scanning ends at the first error token.
func (w Web) findAllLinks(httpBody io.Reader, baseURL string) (links []string, title string) {
	page := html.NewTokenizer(httpBody)
	for kind := page.Next(); kind != html.ErrorToken; kind = page.Next() {
		tok := page.Token()
		if kind != html.StartTagToken {
			continue
		}
		switch tok.DataAtom.String() {
		case "title":
			if len(tok.Attr) == 0 {
				// The title text is the token right after the tag.
				page.Next()
				title = page.Token().Data
			}
		case "a":
			if href, ok := w.extractLink(tok); ok && w.ShouldCrawl(baseURL, href) {
				links = append(links, w.unifyURL(href, baseURL))
			}
		}
	}
	return links, title
}
开发者ID:AntoineAugusti,项目名称:crawler,代码行数:27,代码来源:web.go
示例18: load
// load downloads the index page at a.baseurl and appends to a.versions the
// version of every linked file named "webots-<version>-<arch>.tar.bz2", where
// <arch> is a.arch. The versions are sorted afterwards.
//
// An error is returned when the page cannot be fetched, when the tokenizer
// fails with anything other than end of input, or when ParseWebotsVersion
// rejects a version string. In the last two cases, versions appended before
// the failure stay in a.versions.
func (a *HttpWebotsArchive) load() error {
	resp, err := http.Get(a.baseurl)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	tokenizer := html.NewTokenizer(resp.Body)
	// The dots of the file extension are escaped so they only match a dot.
	nameRx := regexp.MustCompile(fmt.Sprintf(`^webots-(.*)-%s\.tar\.bz2$`, a.arch))
	for {
		t := tokenizer.Next()
		if t == html.ErrorToken {
			if err := tokenizer.Err(); err != io.EOF {
				return err
			}
			break
		}
		if t != html.StartTagToken {
			continue
		}
		tName, hasAttrib := tokenizer.TagName()
		if string(tName) != "a" {
			continue
		}
		// hasAttrib is refreshed on every call, so the loop also terminates
		// when the last attribute of the tag is not an href.
		for hasAttrib {
			var key, val []byte
			key, val, hasAttrib = tokenizer.TagAttr()
			if string(key) != "href" {
				continue
			}
			// we got a link, test if it has the right file name
			matches := nameRx.FindStringSubmatch(string(val))
			if matches == nil {
				continue
			}
			v, err := ParseWebotsVersion(matches[1])
			if err != nil {
				return err
			}
			a.versions = append(a.versions, v)
		}
	}
	sort.Sort(&a.versions)
	return nil
}
开发者ID:biorob,项目名称:webots-manager,代码行数:60,代码来源:webots_archive.go
示例19: FindIcon
// FindIcon returns the href attribute of the first <link rel="shortcut icon">
// start or self-closing tag in the HTML document b that has a non-empty href.
// ErrNoIcon is returned when tokenizing stops, at end of input or on a
// tokenizer error, before such a tag is found.
func FindIcon(b []byte) (string, error) {
	z := html.NewTokenizer(bytes.NewReader(b))
	for {
		if z.Next() == html.ErrorToken {
			return "", ErrNoIcon
		}
		tok := z.Token()
		isOpening := tok.Type == html.StartTagToken || tok.Type == html.SelfClosingTagToken
		if !isOpening || tok.DataAtom != atom.Link {
			continue
		}
		// When an attribute is repeated, the last occurrence wins.
		var rel, href string
		for _, a := range tok.Attr {
			switch a.Key {
			case "rel":
				rel = a.Val
			case "href":
				href = a.Val
			}
		}
		if rel == "shortcut icon" && href != "" {
			return href, nil
		}
	}
}
开发者ID:kissthink,项目名称:goread,代码行数:28,代码来源:autodiscover.go
示例20: extractLinkUrls
// extractLinkUrls returns the value of every href attribute found on the <a>
// start tags of the HTML document page, in document order. The result is an
// empty, non-nil slice when the document contains no such attribute.
func extractLinkUrls(page string) []string {
	z := html.NewTokenizer(strings.NewReader(page))
	// Length 0 with capacity 10: a non-zero length would put empty strings in
	// front of the real results.
	hrefs := make([]string, 0, 10)
	for {
		switch z.Next() {
		case html.ErrorToken:
			// End of the document, we're done
			return hrefs
		case html.StartTagToken:
			t := z.Token()
			if t.Data != "a" {
				continue
			}
			// we found a link
			for _, attr := range t.Attr {
				if attr.Key == "href" {
					hrefs = append(hrefs, attr.Val)
				}
			}
		}
	}
}
开发者ID:carriercomm,项目名称:gocrawl,代码行数:29,代码来源:crawler.go
注:本文中的golang.org/x/net/html.NewTokenizer函数示例由纯净天空整理自GitHub/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论