Golang html.NewTokenizer函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏邀请

本文整理汇总了Golang中golang.org/x/net/html.NewTokenizer函数的典型用法代码示例。如果您正苦于以下问题：Golang NewTokenizer函数的具体用法？Golang NewTokenizer怎么用？Golang NewTokenizer使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。

在下文中一共展示了NewTokenizer函数的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Golang代码示例。

示例1: ProcessHTML

// ProcessHTML tokenizes the HTML read from buffer and hands the attributes of
// every <meta> tag that carries attributes to og.ProcessMeta.
//
// Scanning stops with a nil error when the input is exhausted (io.EOF) or as
// soon as a body tag (opening or closing) is seen. Any other tokenizer error
// is returned unchanged.
func (og *OpenGraph) ProcessHTML(buffer io.Reader) error {
	z := html.NewTokenizer(buffer)
	for {
		switch z.Next() {
		case html.ErrorToken:
			if err := z.Err(); err != io.EOF {
				return err
			}
			return nil
		case html.StartTagToken, html.SelfClosingTagToken, html.EndTagToken:
			name, hasAttr := z.TagName()
			// Resolve the tag name once; it is compared twice below.
			tag := atom.Lookup(name)
			if tag == atom.Body {
				return nil // OpenGraph is only in head, so we don't need body
			}
			if tag != atom.Meta || !hasAttr {
				continue
			}
			// Collect every attribute of the meta tag into a key -> value map.
			m := make(map[string]string)
			var key, val []byte
			for hasAttr {
				key, val, hasAttr = z.TagAttr()
				m[atom.String(key)] = string(val)
			}
			og.ProcessMeta(m)
		}
	}
}

开发者ID:dyatlov，项目名称:go-opengraph，代码行数:30，代码来源:opengraph.go

示例2: htmlToText

// htmlToText tokenizes the HTML read from r and returns its text content,
// one whitespace-trimmed, non-empty text token per line.
//
// Text is skipped while at least one tag listed in ignoretag is open.
// Comments are dropped. The accumulated output is returned as soon as the
// tokenizer reports an error token (end of input or a read failure).
func htmlToText(r io.Reader) []byte {
	t := html.NewTokenizer(r)

	var out bytes.Buffer

	// depth counts the currently open tags that are listed in ignoretag.
	var depth int
	for {
		switch t.Next() {
		case html.StartTagToken:
			if _, ok := ignoretag[t.Token().Data]; ok {
				depth++
			}
		case html.EndTagToken:
			// Never go below zero: an unmatched closing tag would otherwise
			// leave the counter negative and suppress all following text.
			if _, ok := ignoretag[t.Token().Data]; ok && depth > 0 {
				depth--
			}
		case html.ErrorToken:
			return out.Bytes()
		case html.TextToken:
			if depth == 0 {
				if text := strings.TrimSpace(t.Token().Data); len(text) > 0 {
					fmt.Fprintln(&out, text)
				}
			}
		}
	}
}

开发者ID:husio，项目名称:apps，代码行数:30，代码来源:scrap.go

示例3: rewriteHTML

// rewriteHTML copies the HTML from reader to writer, passing the value of
// every URL-carrying attribute through urlRewriter on the way.
//
// Which attributes of which tags count as URL-carrying is decided by the
// atomsToAttrs table. Start and self-closing tags are re-serialized from the
// parsed token; every other token is written out as its raw bytes. Reaching
// the end of the input is not an error; any other tokenizer or write error is
// returned.
func rewriteHTML(reader io.Reader, writer io.Writer, urlRewriter func(string) string) error {
	// The bytes are handed to the tokenizer as-is, so the content is assumed
	// to be UTF-8.
	z := html.NewTokenizer(reader)

	for {
		var err error
		switch z.Next() {
		case html.ErrorToken:
			err = z.Err()
		case html.StartTagToken, html.SelfClosingTagToken:
			tag := z.Token()
			urlAttrs, known := atomsToAttrs[tag.DataAtom]
			if known {
				for idx := range tag.Attr {
					if urlAttrs.Has(tag.Attr[idx].Key) {
						tag.Attr[idx].Val = urlRewriter(tag.Attr[idx].Val)
					}
				}
			}
			_, err = writer.Write([]byte(tag.String()))
		default:
			_, err = writer.Write(z.Raw())
		}

		if err == nil {
			continue
		}
		if err != io.EOF {
			return err
		}
		return nil
	}
}

开发者ID:johndmulhausen，项目名称:kubernetes，代码行数:32，代码来源:transport.go

示例4: isHTML

// isHTML reports whether an <html> start tag is found while tokenizing at
// most the first 1024 bytes of content. Empty content is never HTML.
func isHTML(content []byte) bool {
	if len(content) == 0 {
		return false
	}

	// Only a bounded prefix of the document is inspected.
	const sniffLen = 1024
	if len(content) > sniffLen {
		content = content[:sniffLen]
	}

	z := html.NewTokenizer(bytes.NewReader(content))
	for {
		switch z.Next() {
		case html.ErrorToken:
			// Ran out of input (or hit an error) without seeing <html>.
			return false
		case html.StartTagToken:
			if name, _ := z.TagName(); bytes.Equal(name, []byte("html")) {
				return true
			}
		}
	}
}

开发者ID:ReanGD，项目名称:go-web-search，代码行数:26，代码来源:body_parser.go

示例5: scrape

// scrape tokenizes the HTML read from r until the closing </head> tag or the
// first tokenizer error, whichever comes first.
//
// NOTE(review): this looks like unfinished work. The bytes gathered in the
// local buffer are never used, and the <tr> branch is an empty placeholder.
func scrape(r io.Reader) {
	tokenizer := html.NewTokenizer(r)
	var pending bytes.Buffer

	for {
		kind := tokenizer.Next()
		current := tokenizer.Token()

		switch kind {
		case html.ErrorToken:
			// Kept so that non-HTML bodies served with an HTML content type
			// (tracking pixels, for instance) simply end the scan.
			return
		case html.EndTagToken:
			// At the end of the head, keep whatever the tokenizer has
			// buffered but not yet tokenized, then stop.
			if current.String() == "</head>" {
				pending.Write(tokenizer.Buffered())
				return
			}
		case html.StartTagToken:
			if current.DataAtom == atom.Tr {
				// Placeholder: check for the correct class attribute and
				// then switch to html.NewTokenizerFragment.
			}
		}
	}
}

开发者ID:bentranter，项目名称:bookstore，代码行数:31，代码来源:main.go

示例6: Autodiscover

// Autodiscover scans the HTML in b for a <link> tag advertising a feed and
// returns the value of its href attribute.
//
// A link qualifies when rel is exactly "alternate", href is non-empty and
// type is "application/rss+xml" or "application/atom+xml". ErrNoRssLink is
// returned when no such link exists or when tokenizing fails.
func Autodiscover(b []byte) (string, error) {
	z := html.NewTokenizer(bytes.NewReader(b))
	for {
		if z.Next() == html.ErrorToken {
			if z.Err() == io.EOF {
				break // input exhausted without a match
			}
			return "", ErrNoRssLink
		}

		tok := z.Token()
		if tok.DataAtom != atom.Link {
			continue
		}
		if tok.Type != html.StartTagToken && tok.Type != html.SelfClosingTagToken {
			continue
		}

		// When an attribute is repeated, its last occurrence wins.
		var rel, href, kind string
		for _, a := range tok.Attr {
			switch a.Key {
			case "rel":
				rel = a.Val
			case "href":
				href = a.Val
			case "type":
				kind = a.Val
			}
		}

		if rel != "alternate" || href == "" {
			continue
		}
		if kind == "application/rss+xml" || kind == "application/atom+xml" {
			return href, nil
		}
	}
	return "", ErrNoRssLink
}

开发者ID:kissthink，项目名称:goread，代码行数:28，代码来源:autodiscover.go

示例7: ParseHtml

// ParseHtml tokenizes the HTML read from r and rebuilds it into a new byte
// slice, rewriting parts of it relative to url.
//
// Doctype, comment, end-tag and ordinary text tokens are copied through as
// raw bytes. Start and self-closing tags are emitted by flushTagToken. A
// non-blank text token that directly follows a <style> start tag is replaced
// by the result of ParseCss.
//
// It returns the rebuilt document once the input is exhausted, or an empty
// slice and the tokenizer's error for any failure other than io.EOF.
func ParseHtml(r io.Reader, url string) ([]byte, error) {
	z := html.NewTokenizer(r)
	var newHtml []byte
	// lastTag is whatever flushTagToken returned for the immediately
	// preceding start tag (presumably its tag name; verify against
	// flushTagToken), or "" when the previous token was not a start tag.
	lastTag := ""
	for {
		tt := z.Next()
		rawHtmlBytes := z.Raw()
		switch tt {
		case html.ErrorToken:
			// Compare against the sentinel rather than the error text.
			if err := z.Err(); err != io.EOF {
				return make([]byte, 0), err
			}
			return newHtml, nil
		case html.TextToken:
			rawHtml := strings.TrimSpace(string(rawHtmlBytes))
			if len(rawHtml) > 0 && lastTag == "style" {
				newCss := ParseCss(rawHtml, url)
				newHtml = append(newHtml, []byte(newCss)...)
			} else {
				newHtml = append(newHtml, rawHtmlBytes...)
			}
		case html.DoctypeToken, html.CommentToken, html.EndTagToken:
			newHtml = append(newHtml, rawHtmlBytes...)
		case html.StartTagToken:
			lastTag = flushTagToken(&newHtml, z, url)
		case html.SelfClosingTagToken:
			flushTagToken(&newHtml, z, url)
		}
		// Only a start tag may influence how the next token is handled.
		if tt != html.StartTagToken {
			lastTag = ""
		}
	}
}

开发者ID:gongshw，项目名称:lighthouse，代码行数:35，代码来源:html.go

示例8: GetPriceForBestBuy

// GetPriceForBestBuy downloads url and returns the price found on the page.
//
// It looks for a <meta> start tag with an id attribute containing
// "schemaorg-offer-price". If the token right after that tag is text, the
// text is converted with parseCurrency and returned. 0.0 is returned when the
// tokenizer reports an error token before a price is found. A failed HTTP
// request terminates the process through log.Fatal.
func GetPriceForBestBuy(url string) float64 {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	z := html.NewTokenizer(resp.Body)
	for {
		switch z.Next() {
		case html.ErrorToken:
			return 0.0
		case html.StartTagToken:
			tag := z.Token()
			if tag.Data != "meta" {
				continue
			}
			for _, a := range tag.Attr {
				if a.Key != "id" || !strings.Contains(a.Val, "schemaorg-offer-price") {
					continue
				}
				// The price is expected in the text directly after the tag.
				if z.Next() == html.TextToken {
					return parseCurrency(z.Token().Data)
				}
			}
		}
	}
}

开发者ID:vinaygaba，项目名称:pricetell，代码行数:30，代码来源:bestbuy.go

示例9: getLinks

// getLinks fetches the page at u and returns the URLs that getURL extracts
// from its start and self-closing tags, in document order.
//
// If the request fails, the failure is logged and an empty slice is returned.
// Scanning stops at the first tokenizer error token (end of input or a read
// failure), and whatever was collected up to that point is returned.
func getLinks(u *url.URL) []*url.URL {
	links := make([]*url.URL, 0)

	resp, err := http.Get(u.String())
	if err != nil {
		logs.Log(fmt.Sprintf("Couldn't crawl %s", u))
		// resp is nil on error, so there is nothing to close or parse.
		return links
	}
	defer resp.Body.Close()

	tokenizer := html.NewTokenizer(resp.Body)
	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			return links
		case html.StartTagToken, html.SelfClosingTagToken:
			if link, ok := getURL(u, tokenizer.Token()); ok {
				links = append(links, link)
			}
		}
	}
}

开发者ID:fueledbymarvin，项目名称:gocardless，代码行数:25，代码来源:crawler.go

示例10: parseTitle

// parseTitle returns the text that directly follows the first usable <title>
// start tag in the HTML read from resp.
//
// At most 8192 bytes of resp are read. fallback is returned when the input
// ends (or fails) before a title is found. If the tokenizer fails right
// after a <title> tag, the literal "Failed to parse title" is returned. A
// <title> tag followed by anything other than text is skipped and the search
// continues.
func parseTitle(resp io.Reader, fallback string) string {
	r := io.LimitedReader{
		R: resp,
		N: 8192,
	}

	h := html.NewTokenizer(&r)
	for {
		switch h.Next() {
		case html.ErrorToken:
			return fallback
		case html.StartTagToken:
			tag, _ := h.TagName()
			if string(tag) != "title" {
				continue
			}
			switch h.Next() {
			case html.ErrorToken:
				return "Failed to parse title"
			case html.TextToken:
				return h.Token().Data
			}
		}
	}
}

开发者ID:velour，项目名称:holdmypage，代码行数:28，代码来源:main.go

示例11: obtainCsrf

// obtainCsrf scans the HTML read from b for a CSRF token and stores it in
// w.csrf.
//
// The token is taken from a self-closing <meta> tag whose first two
// attributes are, in either order, name="csrf-token" and the attribute
// holding the token value. While scanning, the token following a
// <div id="flash_alert"> start tag is captured as an error.
//
// The whole input is always consumed. The return value is the last
// flash-alert error seen, or nil if there was none.
func (w *WebClient) obtainCsrf(b io.Reader) error {
	var errorMessage error
	z := html.NewTokenizer(b)

	for {
		switch z.Next() {
		case html.ErrorToken:
			return errorMessage
		case html.SelfClosingTagToken:
			t := z.Token()
			// Both of the first two attributes are indexed below, so a meta
			// tag with fewer than two attributes must be skipped.
			if t.Data != "meta" || len(t.Attr) < 2 {
				continue
			}
			switch {
			case t.Attr[1].Key == "name" && t.Attr[1].Val == "csrf-token":
				w.csrf = t.Attr[0].Val
				log.Debugf("Csrf Token: %s", w.csrf)
			case t.Attr[0].Key == "name" && t.Attr[0].Val == "csrf-token":
				w.csrf = t.Attr[1].Val
				log.Debugf("Csrf Token: %s", w.csrf)
			}
		case html.StartTagToken:
			t := z.Token()
			if t.Data == "div" && len(t.Attr) > 0 && t.Attr[0].Key == "id" && t.Attr[0].Val == "flash_alert" {
				// The alert message is the token right after the div.
				z.Next()
				errorMessage = errors.New(z.Token().String())
			}
		}
	}
}

开发者ID:odacremolbap，项目名称:concerto，代码行数:31，代码来源:setup.go

示例12: Crawl

// Crawl fetches url_prefix+url and scans the page for summary <div> tags,
// handing the tokenizer to getQ for each one found.
//
// The sentinel string "END!" is sent on ch when the page has been fully
// scanned. It is also sent when the HTTP request fails, so that a receiver
// waiting for the sentinel (presumably how callers detect completion;
// verify) is still notified.
func Crawl(url string, ch chan string) {
	resp, err := http.Get(url_prefix + url)
	if err != nil {
		// resp is nil on error; report completion instead of dereferencing it.
		ch <- "END!"
		return
	}
	defer resp.Body.Close()

	tokenizer := html.NewTokenizer(resp.Body)
	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			// End of page
			ch <- "END!"
			return
		case html.StartTagToken:
			start := tokenizer.Token()
			if start.Data == "div" && isSummary(start) {
				// NOTE(review): getQ is given a copy of the tokenizer, so
				// tokens it consumes do not advance this loop's tokenizer
				// state. Confirm this is intended.
				getQ(*tokenizer, ch)
			}
		}
	}
}

开发者ID:carol-hsu，项目名称:go-study，代码行数:26，代码来源:multiple-web-crawlers.go

示例13: avanza_get_sellprice

// avanza_get_sellprice extracts the current sell price from an Avanza page.
//
// It scans resp.Body for the first <span> start tag that has any attribute
// value containing "sellPrice". It then takes the next token, replaces every
// "," with "." and parses the result as a float64. 0.0 is returned when the
// tokenizer reports an error token before such a span is found.
func (p *Parse) avanza_get_sellprice(resp *http.Response) float64 {
	z := html.NewTokenizer(resp.Body)

	for {
		kind := z.Next()
		if kind == html.ErrorToken {
			return 0.0
		}
		if kind != html.StartTagToken {
			continue
		}

		tag := z.Token()
		if tag.Data != "span" {
			continue
		}

		for _, a := range tag.Attr {
			if !strings.Contains(a.Val, "sellPrice") {
				continue
			}
			// The price is the token right after the matching span.
			z.Next()
			raw := z.Token().String()
			// A parse failure is ignored; the value ParseFloat produced is
			// returned as-is.
			price, _ := strconv.ParseFloat(strings.Replace(raw, ",", ".", -1), 64)
			return price
		}
	}
}

开发者ID:Balzzanar，项目名称:golang，代码行数:32，代码来源:parse.go

示例14: GetPriceForWalmart

// GetPriceForWalmart downloads url and returns the price found on the page.
//
// It looks for a <script> start tag with an id attribute containing
// "tb-djs-wml-base". If the token right after that tag is text, the text is
// handed to parseJson and its result returned. 0.0 is returned when the
// tokenizer reports an error token before a price is found. A failed HTTP
// request terminates the process through log.Fatal.
func GetPriceForWalmart(url string) float64 {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	z := html.NewTokenizer(resp.Body)
	for {
		switch z.Next() {
		case html.ErrorToken:
			return 0.0
		case html.StartTagToken:
			tag := z.Token()
			if tag.Data != "script" {
				continue
			}
			for _, a := range tag.Attr {
				if a.Key != "id" || !strings.Contains(a.Val, "tb-djs-wml-base") {
					continue
				}
				// The payload is expected as the script's text content.
				if z.Next() == html.TextToken {
					return parseJson(z.Token().Data)
				}
			}
		}
	}
}

开发者ID:vinaygaba，项目名称:pricetell，代码行数:29，代码来源:walmart.go

示例15: scrapePageWorker

// scrapePageWorker tokenizes the HTML behind page and reports anchors on out.
//
// For every <a> start tag, the value of its id attribute is stored in
// element 0 of a pair and the value of data-href in element 1. The pair is
// sent on out each time a data-href attribute is encountered. When the
// tokenizer reports an error token the worker returns, and true is always
// sent on chFinished on the way out.
func scrapePageWorker(page *io.ReadCloser, out chan [2]string, chFinished chan bool) {
	defer func() { chFinished <- true }()

	tokenizer := html.NewTokenizer(*page)
	for {
		kind := tokenizer.Next()
		if kind == html.ErrorToken {
			return
		}
		if kind != html.StartTagToken {
			continue
		}

		tag := tokenizer.Token()
		if tag.Data != "a" {
			continue
		}

		// pair[0] holds the id, pair[1] the data-href of this anchor.
		var pair [2]string
		for _, a := range tag.Attr {
			switch a.Key {
			case "id":
				pair[0] = a.Val
			case "data-href":
				pair[1] = a.Val
				out <- pair
			}
		}
	}
}

开发者ID:ohrodr，项目名称:2kcookies，代码行数:34，代码来源:2kcookies.go

示例16: getHTMLContent

// getHTMLContent returns the first text token found while an element named
// tag is open in the HTML read from r.
//
// Seeing a start tag named tag opens the element, and seeing an end tag with
// that name closes it again. If the tokenizer reports an error token first,
// an empty string and the tokenizer's error are returned; this includes
// io.EOF.
func getHTMLContent(r io.Reader, tag []byte) (result string, err error) {
	z := html.NewTokenizer(r)
	inside := false

	for {
		switch kind := z.Next(); kind {
		case html.ErrorToken:
			return "", z.Err()
		case html.TextToken:
			if inside {
				return string(z.Text()), nil
			}
		case html.StartTagToken, html.EndTagToken:
			if name, _ := z.TagName(); bytes.Equal(name, tag) {
				inside = kind == html.StartTagToken
			}
		}
	}
}

开发者ID:pedronasser，项目名称:caddy-search，代码行数:28，代码来源:pipeline.go

示例17: findAllLinks

// findAllLinks scans the HTML read from httpBody and returns the links worth
// crawling together with the page title.
//
// A link is collected for every <a> start tag for which w.extractLink yields
// an href and w.ShouldCrawl accepts it. Links are normalized with w.unifyURL
// against baseURL. The title is the text directly following an
// attribute-less <title> start tag; when several qualify, the last one wins.
// Scanning ends at the first tokenizer error token (end of input or a read
// failure).
func (w Web) findAllLinks(httpBody io.Reader, baseURL string) (links []string, title string) {
	page := html.NewTokenizer(httpBody)
	for {
		tokenType := page.Next()
		// End of the page, we are done
		if tokenType == html.ErrorToken {
			return links, title
		}
		if tokenType != html.StartTagToken {
			continue
		}

		token := page.Token()
		switch token.DataAtom.String() {
		case "title":
			// React uses <title> tags also, but they have got special
			// attributes. Accept the following token only if it is text:
			// for an empty <title></title> it would be the end tag, and at
			// the end of input it would be an error token.
			if len(token.Attr) == 0 && page.Next() == html.TextToken {
				title = page.Token().Data
			}
		case "a":
			href, hasLink := w.extractLink(token)
			if hasLink && w.ShouldCrawl(baseURL, href) {
				links = append(links, w.unifyURL(href, baseURL))
			}
		}
	}
}

开发者ID:AntoineAugusti，项目名称:crawler，代码行数:27，代码来源:web.go

示例18: load

// load downloads the index page at a.baseurl and records the version of
// every archive linked from it.
//
// Each href of an <a> start tag is matched against
// "webots-<version>-<arch>.tar.bz2". The captured version string is parsed
// with ParseWebotsVersion and appended to a.versions, which is sorted once
// the page has been fully read.
//
// It returns the HTTP error, a tokenizer error other than io.EOF, or the
// first version-parsing error. Versions appended before an error stay in
// a.versions.
func (a *HttpWebotsArchive) load() error {
	resp, err := http.Get(a.baseurl)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	tokenizer := html.NewTokenizer(resp.Body)

	// NOTE(review): the dots in ".tar.bz2" are unescaped and a.arch is
	// interpolated verbatim, so the pattern is looser than the file name it
	// describes. Left unchanged to keep the matching behavior.
	nameRx := regexp.MustCompile(fmt.Sprintf(`^webots-(.*)-%s.tar.bz2$`, a.arch))

	for {
		t := tokenizer.Next()
		if t == html.ErrorToken {
			if err := tokenizer.Err(); err != io.EOF {
				return err
			}
			break
		}

		if t != html.StartTagToken {
			continue
		}

		tName, more := tokenizer.TagName()
		if string(tName) != "a" {
			continue
		}

		// Walk the attributes. "more" is refreshed on every call, so the
		// loop ends even when the last attribute is not an href.
		for more {
			var key, val []byte
			key, val, more = tokenizer.TagAttr()
			if string(key) != "href" {
				continue
			}

			// we got a link, test if it has the right prefix
			matches := nameRx.FindStringSubmatch(string(val))
			if matches == nil {
				continue
			}

			v, err := ParseWebotsVersion(matches[1])
			if err != nil {
				return err
			}

			a.versions = append(a.versions, v)
		}
	}

	sort.Sort(&a.versions)

	return nil
}

开发者ID:biorob，项目名称:webots-manager，代码行数:60，代码来源:webots_archive.go

示例19: FindIcon

// FindIcon scans the HTML in b for a <link rel="shortcut icon"> tag and
// returns the value of its href attribute.
//
// The rel value must be exactly "shortcut icon" and href must be non-empty.
// ErrNoIcon is returned when no such tag exists or when tokenizing fails.
func FindIcon(b []byte) (string, error) {
	z := html.NewTokenizer(bytes.NewReader(b))
	for {
		if z.Next() == html.ErrorToken {
			if z.Err() == io.EOF {
				break // input exhausted without a match
			}
			return "", ErrNoIcon
		}

		tok := z.Token()
		if tok.DataAtom != atom.Link {
			continue
		}
		if tok.Type != html.StartTagToken && tok.Type != html.SelfClosingTagToken {
			continue
		}

		// When an attribute is repeated, its last occurrence wins.
		var rel, href string
		for _, a := range tok.Attr {
			switch a.Key {
			case "rel":
				rel = a.Val
			case "href":
				href = a.Val
			}
		}

		if rel == "shortcut icon" && href != "" {
			return href, nil
		}
	}
	return "", ErrNoIcon
}

开发者ID:kissthink，项目名称:goread，代码行数:28，代码来源:autodiscover.go

示例20: extractLinkUrls

// extractLinkUrls returns the value of every href attribute found on <a>
// start tags in page, in document order.
//
// The result is empty but non-nil when the page contains no links. Scanning
// stops at the first tokenizer error token, which for an in-memory string
// means the end of the document.
func extractLinkUrls(page string) []string {
	z := html.NewTokenizer(strings.NewReader(page))

	// Length 0 with capacity 10: a non-zero length would leave empty strings
	// in front of the appended links.
	hrefs := make([]string, 0, 10)

	for {
		switch z.Next() {
		case html.ErrorToken:
			// End of the document, we're done
			return hrefs
		case html.StartTagToken:
			t := z.Token()
			if t.Data != "a" {
				continue
			}
			// we found a link
			for _, attr := range t.Attr {
				if attr.Key == "href" {
					hrefs = append(hrefs, attr.Val)
				}
			}
		}
	}
}

开发者ID:carriercomm，项目名称:gocrawl，代码行数:29，代码来源:crawler.go

注：本文中的golang.org/x/net/html.NewTokenizer函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。