Skip to content

Commit e144b1b

Browse files
committed
parse URLs throughout URLSource
This resolves a bug where percent-encoded href attributes like <a href="libstdc%2B%2B/">libstdc++/</a> were not decoded (and therefore, sometimes, double-encoded later on).
1 parent 7caef03 commit e144b1b

File tree

2 files changed

+76
-40
lines changed

2 files changed

+76
-40
lines changed

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1+
# v2.1 (TBD)
2+
3+
Changes:
4+
- Giving an invalid URL in `jobs[].from.url` now results in immediate failure during configuration parsing instead of
5+
indeterministic errors later on.
6+
- It is now an error for `jobs[].from.url` to not have a trailing slash. For now, a missing trailing slash will be added
7+
and execution will continue, but this error will become fatal in a future version.
8+
- The README now includes anti-usecases, in the "Do NOT use if..." section.
9+
10+
Bugfixes:
11+
- Percent-encoded URLs in directory listings are now decoded correctly.
12+
113
# v2.0 (2017-10-16)
214

315
**Backwards-incompatible changes:**

pkg/objects/source.go

Lines changed: 64 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ import (
2626
"io"
2727
"io/ioutil"
2828
"net/http"
29+
"net/url"
30+
"path"
2931
"regexp"
3032
"strconv"
3133
"strings"
@@ -46,10 +48,10 @@ type Source interface {
4648
//ListEntries returns all files and subdirectories at this path in the
4749
//source. Each result value must have a "/" prefix for subdirectories, or
4850
//none for files.
49-
ListEntries(path string) ([]string, *ListEntriesError)
51+
ListEntries(directoryPath string) ([]string, *ListEntriesError)
5052
//GetFile retrieves the contents and metadata for the file at the given path
5153
//in the source.
52-
GetFile(path string, targetState FileState) (body io.ReadCloser, sourceState FileState, err error)
54+
GetFile(directoryPath string, targetState FileState) (body io.ReadCloser, sourceState FileState, err error)
5355
}
5456

5557
//ListEntriesError is an error that occurs while scraping a directory.
@@ -75,7 +77,8 @@ type FileState struct {
7577

7678
//URLSource describes a source that's accessible via HTTP.
7779
type URLSource struct {
78-
URL string `yaml:"url"`
80+
URLString string `yaml:"url"`
81+
URL *url.URL `yaml:"-"`
7982
//auth options
8083
ClientCertificatePath string `yaml:"cert"`
8184
ClientCertificateKeyPath string `yaml:"key"`
@@ -85,8 +88,28 @@ type URLSource struct {
8588

8689
//Validate implements the Source interface.
8790
func (u *URLSource) Validate(name string) (result []error) {
88-
if u.URL == "" {
91+
if u.URLString == "" {
8992
result = append(result, fmt.Errorf("missing value for %s.url", name))
93+
} else {
94+
//parse URL
95+
var err error
96+
u.URL, err = url.Parse(u.URLString)
97+
if err != nil {
98+
result = append(result, fmt.Errorf("invalid value for %s.url: %s", name, err.Error()))
99+
}
100+
101+
//URL must refer to a directory, i.e. have a trailing slash
102+
if u.URL.Path == "" {
103+
u.URL.Path = "/"
104+
u.URL.RawPath = ""
105+
}
106+
if !strings.HasSuffix(u.URL.Path, "/") {
107+
util.Log(util.LogError, "source URL '%s' does not have a trailing slash (adding one for now; this will become a fatal error in future versions)", u.URLString)
108+
u.URL.Path += "/"
109+
if u.URL.RawPath != "" {
110+
u.URL.RawPath += "/"
111+
}
112+
}
90113
}
91114

92115
// If one of the following is set, the other one needs also to be set
@@ -144,43 +167,42 @@ func (u *URLSource) Connect() error {
144167
return nil
145168
}
146169

147-
//matches scheme prefix (e.g. "http:" or "git+ssh:") at the start of a full URL
148-
//[RFC 3986, 3.1]
149-
var schemeRx = regexp.MustCompile(`^[a-zA-Z][a-zA-Z0-9+.-]*:`)
150-
151170
//matches ".." path element
152171
var dotdotRx = regexp.MustCompile(`(?:^|/)\.\.(?:$|/)`)
153172

154173
//ListEntries implements the Source interface.
155-
func (u URLSource) ListEntries(path string) ([]string, *ListEntriesError) {
174+
func (u URLSource) ListEntries(directoryPath string) ([]string, *ListEntriesError) {
156175
//get full URL of this subdirectory
157-
uri := u.getURLForPath(path)
176+
uri := u.getURLForPath(directoryPath)
158177
//to get a well-formatted directory listing, the directory URL must have a
159178
//trailing slash (most web servers automatically redirect from the URL
160179
//without trailing slash to the URL with trailing slash; others show a
161180
//slightly different directory listing that we cannot parse correctly)
162-
if !strings.HasSuffix(uri, "/") {
163-
uri += "/"
181+
if !strings.HasSuffix(uri.Path, "/") {
182+
uri.Path += "/"
183+
if uri.RawPath != "" {
184+
uri.RawPath += "/"
185+
}
164186
}
165187

166188
util.Log(util.LogDebug, "scraping %s", uri)
167189

168190
//retrieve directory listing
169191
//TODO: This should send "Accept: text/html", but at least Apache and nginx
170192
//don't care about the Accept header, anyway, as far as my testing showed.
171-
response, err := u.HTTPClient.Get(uri)
193+
response, err := u.HTTPClient.Get(uri.String())
172194
if err != nil {
173-
return nil, &ListEntriesError{uri, "GET failed: " + err.Error()}
195+
return nil, &ListEntriesError{uri.String(), "GET failed: " + err.Error()}
174196
}
175197
defer response.Body.Close()
176198

177199
//check that we actually got a directory listing
178200
if !strings.HasPrefix(response.Status, "2") {
179-
return nil, &ListEntriesError{uri, "GET returned status " + response.Status}
201+
return nil, &ListEntriesError{uri.String(), "GET returned status " + response.Status}
180202
}
181203
contentType := response.Header.Get("Content-Type")
182204
if !strings.HasPrefix(contentType, "text/html") {
183-
return nil, &ListEntriesError{uri, "GET returned unexpected Content-Type: " + contentType}
205+
return nil, &ListEntriesError{uri.String(), "GET returned unexpected Content-Type: " + contentType}
184206
}
185207

186208
//find links inside the HTML document
@@ -209,38 +231,45 @@ func (u URLSource) ListEntries(path string) ([]string, *ListEntriesError) {
209231
continue
210232
}
211233

234+
hrefURL, err := url.Parse(href)
235+
if err != nil {
236+
util.Log(util.LogError, "scrape %s: ignoring href attribute '%s' which is not a valid URL", uri.String(), href)
237+
continue
238+
}
239+
212240
//filter external links with full URLs
213-
if schemeRx.MatchString(href) {
241+
if hrefURL.Scheme != "" || hrefURL.Host != "" {
214242
continue
215243
}
216-
//links with trailing slashes are absolute paths as well; either to
217-
//another server, e.g. "//ajax.googleapis.com/jquery.js", or to the
218-
//toplevel of this server, e.g. "/static/site.css")
219-
if strings.HasPrefix(href, "/") {
244+
//ignore internal links, and links with a query part (Apache directory
245+
//listings use these for adjustable sorting)
246+
if hrefURL.RawQuery != "" || hrefURL.Fragment != "" {
220247
continue
221248
}
222-
//links with ".." path elements cannot be guaranteed to be pointing to a
223-
//resource below this directory, so skip them as well (this assumes that
224-
//the sender did already clean his relative links so that no ".." appears
225-
//in legitimate downward links)
226-
if dotdotRx.MatchString(href) {
249+
//ignore absolute paths to the toplevel of this server, e.g. "/static/site.css")
250+
if strings.HasPrefix(hrefURL.Path, "/") {
227251
continue
228252
}
229-
//ignore links with a query part (Apache directory listings use these for
230-
//adjustable sorting)
231-
if strings.Contains(href, "?") {
253+
254+
//cleanup path, but retain trailing slash to tell directories and files apart
255+
linkPath := path.Clean(hrefURL.Path)
256+
if strings.HasSuffix(hrefURL.Path, "/") {
257+
linkPath += "/"
258+
}
259+
//ignore links leading outside the current directory
260+
if dotdotRx.MatchString(hrefURL.Path) {
232261
continue
233262
}
234263

235-
result = append(result, href)
264+
result = append(result, linkPath)
236265
}
237266
}
238267
}
239268
}
240269

241270
//GetFile implements the Source interface.
242-
func (u URLSource) GetFile(path string, targetState FileState) (io.ReadCloser, FileState, error) {
243-
uri := u.getURLForPath(path)
271+
func (u URLSource) GetFile(directoryPath string, targetState FileState) (io.ReadCloser, FileState, error) {
272+
uri := u.getURLForPath(directoryPath).String()
244273

245274
//prepare request to retrieve from source, taking advantage of Etag and
246275
//Last-Modified where possible
@@ -287,12 +316,7 @@ func (u URLSource) GetFile(path string, targetState FileState) (io.ReadCloser, F
287316
}, nil
288317
}
289318

290-
//Return the URL for the given path below this URLSource.
291-
func (u URLSource) getURLForPath(path string) string {
292-
result := u.URL
293-
if !strings.HasSuffix(result, "/") {
294-
result += "/"
295-
}
296-
297-
return result + strings.TrimPrefix(path, "/")
319+
//Return the URL for the given directoryPath below this URLSource.
320+
func (u URLSource) getURLForPath(directoryPath string) *url.URL {
321+
return u.URL.ResolveReference(&url.URL{Path: strings.TrimPrefix(directoryPath, "/")})
298322
}

0 commit comments

Comments
 (0)