Browse Source

Archive HTML documents using Prince XML

PhantomJS could perhaps be used in the future, but I haven't yet figured
out how to take partial screenshots that incrementally cover the entire
page.
Lucas Stadler 8 years ago
parent
commit
00060f62fb
1 changed files with 119 additions and 0 deletions
  1. 119 0
      go/archive/archive.go

+ 119 - 0
go/archive/archive.go

1
package main
2
3
import (
	"crypto/rand"
	"encoding/json"
	"fmt"
	"html"
	"net/url"
	"os"
	"os/exec"
	"path"
	"path/filepath"
	"sort"
)
13
14
// Archive is the document stored in archive.json: the index of everything
// that has been archived so far.
type Archive struct {
	// Mappings is keyed by the archived URL (as produced by url.URL.String)
	// and holds the file:// location of the HTML document generated for it.
	// It is nil when the decoded JSON has no "mappings" member.
	Mappings map[string]string `json:"mappings"`
}
17
18
func main() {
19
	u, err := url.Parse(os.Args[1])
20
	if err != nil {
21
		exit("url.Parse", err)
22
	}
23
24
	f, err := os.Open("archive.json")
25
	if err != nil {
26
		exit("os.Open", err)
27
	}
28
29
	var archive Archive
30
	dec := json.NewDecoder(f)
31
	err = dec.Decode(&archive)
32
	if err != nil {
33
		exit("dec.Decode", err)
34
	}
35
	f.Close()
36
37
	p, ok := archive.Mappings[u.String()]
38
	if ok {
39
		fmt.Println("==> Archived at", p)
40
		return
41
	}
42
43
	if u.Scheme != "http" && u.Scheme != "https" {
44
		fmt.Fprintf(os.Stderr, "Unknown url scheme %q\n", u.Scheme)
45
		os.Exit(1)
46
	}
47
48
	fmt.Println("==> Archiving", u)
49
50
	buf := make([]byte, 16)
51
	_, err = rand.Read(buf)
52
	if err != nil {
53
		exit("rand.Read", err)
54
	}
55
56
	cmd := exec.Command("prince", "--javascript", "--raster-output", fmt.Sprintf(".archive/%x-%%d.png", buf), u.String())
57
	cmd.Stderr = os.Stderr
58
	cmd.Stdout = os.Stdout
59
	err = cmd.Run()
60
	if err != nil {
61
		exit("prince", err)
62
	}
63
64
	parts, err := filepath.Glob(fmt.Sprintf(".archive/%x-*.png", buf))
65
	if err != nil {
66
		exit("filepath.Glob", err)
67
	}
68
69
	h := fmt.Sprintf(".archive/%x.html", buf)
70
	f, err = os.OpenFile(h, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0660)
71
	if err != nil {
72
		exit("os.OpenFile", err)
73
	}
74
75
	fmt.Fprintf(f, `<doctype html>
76
<html>
77
	<head>
78
		<title>%s</title>
79
	</head>
80
81
	<body>
82
`, u)
83
84
	wd, err := os.Getwd()
85
	if err != nil {
86
		exit("os.Getwd", err)
87
	}
88
89
	for _, p := range parts {
90
		fmt.Fprintf(f, "<img src=%q />\n", path.Join(wd, p))
91
	}
92
93
	fmt.Fprintf(f, "\n\t</body>\n</html>")
94
	f.Close()
95
96
	if archive.Mappings == nil {
97
		archive.Mappings = make(map[string]string, 1)
98
	}
99
	p = fmt.Sprintf("file://%s", path.Join(wd, h))
100
	archive.Mappings[u.String()] = p
101
102
	f, err = os.OpenFile("archive.json", os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0660)
103
	if err != nil {
104
		exit("os.OpenFile", err)
105
	}
106
107
	enc := json.NewEncoder(f)
108
	err = enc.Encode(&archive)
109
	if err != nil {
110
		exit("enc.Encode", err)
111
	}
112
	f.Close()
113
114
	fmt.Println("==> Archived at", p)
115
}
116
117
// exit reports a failed step on standard error in the form
// "Error: <msg>: <err>" and terminates the program with exit status 1.
//
// msg names the operation that failed and err is the error it returned.
// exit never returns, so callers do not need to stop execution themselves.
// Deferred functions do not run because the process ends via os.Exit.
func exit(msg string, err error) {
	fmt.Fprintf(os.Stderr, "Error: %s: %s\n", msg, err)
	os.Exit(1)
}