Przeglądaj źródła

inquire: A tiny program to get metadata about a url

Supports a very limited set of `meta` tags, but works well enough.  It
can generate very simple html, and is easy to use in scripts.
Lucas Stadler 9 lat temu
rodzic
commit
40442c3eb1
1 zmienionych plików z 172 dodań i 0 usunięć
  1. 172 0
      go/inquire/inquire.go

+ 172 - 0
go/inquire/inquire.go

@ -0,0 +1,172 @@
1
package main
2
3
import (
4
	"bytes"
5
	"encoding/json"
6
	"flag"
7
	"fmt"
8
	"net/http"
9
	"os"
10
11
	"code.google.com/p/cascadia"
12
	"code.google.com/p/go.net/html"
13
)
14
15
// PageInfo holds the metadata extracted from a single web page.
type PageInfo struct {
	// RawURL is the URL the page was requested with, exactly as given.
	RawURL string `json:"url"`
	// Title is the text of the `title` element or, when there is none,
	// the value of the meta tag `title` or `twitter:title`.
	Title string `json:"title"`
	// Description is the value of the meta tag `description` or
	// `og:description`.
	Description string `json:"description,omitempty"`
	// Image is the value of the meta tag `og:image`.
	//
	// Note that this is expected to be an image, not the icon of the
	// webpage.
	Image string `json:"image,omitempty"`
}
29
30
// config holds the settings chosen on the command line.
var config struct {
	// output names the format the page info is printed in.
	output string
}
33
34
func init() {
35
	flag.StringVar(&config.output, "output", "text", "what format to output")
36
}
37
38
func main() {
39
	flag.Parse()
40
41
	u := flag.Args()[0]
42
43
	info, err := GetPageInfo(u)
44
	if err != nil {
45
		panic(err)
46
	}
47
48
	switch config.output {
49
	case "text":
50
		fmt.Printf("url: %s\ntitle: %s\ndescription: %s\nimage: %s\n",
51
			info.RawURL, info.Title, info.Description, info.Image)
52
	case "html":
53
		fmt.Printf("<h1><a href=\"%s\">%s</a></h1>\n", info.RawURL, info.Title)
54
		if info.Image != "" {
55
			fmt.Printf("<img src=\"%s\" />\n", info.Image)
56
		}
57
		if info.Description != "" {
58
			fmt.Printf("<p>%s</p>\n", info.Description)
59
		}
60
	case "json":
61
		out, err := json.MarshalIndent(info, "", "  ")
62
		if err != nil {
63
			fmt.Fprintln(os.Stderr, err)
64
			os.Exit(1)
65
		}
66
		os.Stdout.Write(out)
67
	case "yaml":
68
		fmt.Printf("- url: %s\n", info.RawURL)
69
		fmt.Printf("  title: %s\n", info.Title)
70
		if info.Description != "" {
71
			fmt.Printf("  description: %s\n", info.Description)
72
		}
73
		if info.Image != "" {
74
			fmt.Printf("  image: %s\n", info.Image)
75
		}
76
	default:
77
		fmt.Fprintln(os.Stderr, "unknown output format:", config.output)
78
		os.Exit(1)
79
	}
80
}
81
82
func GetPageInfo(u string) (*PageInfo, error) {
83
	res, err := http.Get(u)
84
	if err != nil {
85
		return nil, err
86
	}
87
	defer res.Body.Close()
88
89
	tree, err := html.Parse(res.Body)
90
	if err != nil {
91
		return nil, err
92
	}
93
94
	sel := cascadia.MustCompile("meta")
95
	meta := sel.MatchAll(tree)
96
97
	found, title := findTitle(tree)
98
	if !found {
99
		_, title = findProperty(meta, "title", "twitter:title")
100
	}
101
102
	_, description := findProperty(meta, "description", "og:description")
103
	_, image := findProperty(meta, "og:image")
104
105
	return &PageInfo{
106
		Title:       title,
107
		Description: description,
108
		Image:       image,
109
		RawURL:      u,
110
	}, nil
111
}
112
113
// findTitle looks up the first node in tree matching the selector
// `title` and returns the concatenation of the text nodes directly
// below it. found is false when tree contains no such node.
func findTitle(tree *html.Node) (found bool, title string) {
	cur := cascadia.MustCompile("title").MatchFirst(tree)
	if cur == nil {
		return false, ""
	}

	// Step down to the children when the match is an element.
	if cur.Type == html.ElementNode {
		cur = cur.FirstChild
	}

	var text bytes.Buffer
	for ; cur != nil; cur = cur.NextSibling {
		if cur.Type != html.TextNode {
			continue
		}
		text.WriteString(cur.Data)
	}

	return true, text.String()
}
135
136
// findProperty walks nodes in order and returns the `content` attribute
// of the first node that has a `property` or `name` attribute equal to
// one of properties. Matching nodes without a `content` attribute are
// skipped. found is false when no node qualifies.
func findProperty(nodes []*html.Node, properties ...string) (found bool, value string) {
	wanted := func(candidate string) bool {
		for _, p := range properties {
			if p == candidate {
				return true
			}
		}
		return false
	}

	for _, n := range nodes {
		for _, a := range n.Attr {
			isNameAttr := a.Key == "property" || a.Key == "name"
			if !isNameAttr || !wanted(a.Val) {
				continue
			}

			if ok, content := findAttr("content", n); ok {
				return true, content
			}
		}
	}

	return false, ""
}
159
160
// findAttr returns the value of the first attribute of node whose key
// equals name. The boolean result is false when node is nil or has no
// such attribute.
func findAttr(name string, node *html.Node) (bool, string) {
	if node != nil {
		for i := range node.Attr {
			if a := node.Attr[i]; a.Key == name {
				return true, a.Val
			}
		}
	}

	return false, ""
}