@@ -26,6 +26,8 @@ import (
2626 "io"
2727 "io/ioutil"
2828 "net/http"
29+ "net/url"
30+ "path"
2931 "regexp"
3032 "strconv"
3133 "strings"
@@ -46,10 +48,10 @@ type Source interface {
4648 //ListEntries returns all files and subdirectories at this path in the
4749 //source. Each result value must have a "/" prefix for subdirectories, or
4850 //none for files.
49- ListEntries (path string ) ([]string , * ListEntriesError )
51+ ListEntries (directoryPath string ) ([]string , * ListEntriesError )
5052 //GetFile retrieves the contents and metadata for the file at the given path
5153 //in the source.
52- GetFile (path string , targetState FileState ) (body io.ReadCloser , sourceState FileState , err error )
54+ GetFile (directoryPath string , targetState FileState ) (body io.ReadCloser , sourceState FileState , err error )
5355}
5456
5557//ListEntriesError is an error that occurs while scraping a directory.
@@ -75,7 +77,8 @@ type FileState struct {
7577
7678//URLSource describes a source that's accessible via HTTP.
7779type URLSource struct {
78- URL string `yaml:"url"`
80+ URLString string `yaml:"url"`
81+ URL * url.URL `yaml:"-"`
7982 //auth options
8083 ClientCertificatePath string `yaml:"cert"`
8184 ClientCertificateKeyPath string `yaml:"key"`
@@ -85,8 +88,28 @@ type URLSource struct {
8588
8689//Validate implements the Source interface.
8790func (u * URLSource ) Validate (name string ) (result []error ) {
88- if u .URL == "" {
91+ if u .URLString == "" {
8992 result = append (result , fmt .Errorf ("missing value for %s.url" , name ))
93+ } else {
94+ //parse URL
95+ var err error
96+ u .URL , err = url .Parse (u .URLString )
97+ if err != nil {
98+ result = append (result , fmt .Errorf ("invalid value for %s.url: %s" , name , err .Error ()))
99+ }
100+
101+ //URL must refer to a directory, i.e. have a trailing slash
102+ if u .URL .Path == "" {
103+ u .URL .Path = "/"
104+ u .URL .RawPath = ""
105+ }
106+ if ! strings .HasSuffix (u .URL .Path , "/" ) {
107+ util .Log (util .LogError , "source URL '%s' does not have a trailing slash (adding one for now; this will become a fatal error in future versions)" , u .URLString )
108+ u .URL .Path += "/"
109+ if u .URL .RawPath != "" {
110+ u .URL .RawPath += "/"
111+ }
112+ }
90113 }
91114
92115 // If one of the following is set, the other one needs also to be set
@@ -144,43 +167,42 @@ func (u *URLSource) Connect() error {
144167 return nil
145168}
146169
147- //matches scheme prefix (e.g. "http:" or "git+ssh:") at the start of a full URL
148- //[RFC 3986, 3.1]
149- var schemeRx = regexp .MustCompile (`^[a-zA-Z][a-zA-Z0-9+.-]*:` )
150-
151170//matches ".." path element
152171var dotdotRx = regexp .MustCompile (`(?:^|/)\.\.(?:$|/)` )
153172
154173//ListEntries implements the Source interface.
155- func (u URLSource ) ListEntries (path string ) ([]string , * ListEntriesError ) {
174+ func (u URLSource ) ListEntries (directoryPath string ) ([]string , * ListEntriesError ) {
156175 //get full URL of this subdirectory
157- uri := u .getURLForPath (path )
176+ uri := u .getURLForPath (directoryPath )
158177 //to get a well-formatted directory listing, the directory URL must have a
159178 //trailing slash (most web servers automatically redirect from the URL
160179 //without trailing slash to the URL with trailing slash; others show a
161180 //slightly different directory listing that we cannot parse correctly)
162- if ! strings .HasSuffix (uri , "/" ) {
163- uri += "/"
181+ if ! strings .HasSuffix (uri .Path , "/" ) {
182+ uri .Path += "/"
183+ if uri .RawPath != "" {
184+ uri .RawPath += "/"
185+ }
164186 }
165187
166188 util .Log (util .LogDebug , "scraping %s" , uri )
167189
168190 //retrieve directory listing
169191 //TODO: This should send "Accept: text/html", but at least Apache and nginx
170192 //don't care about the Accept header, anyway, as far as my testing showed.
171- response , err := u .HTTPClient .Get (uri )
193+ response , err := u .HTTPClient .Get (uri . String () )
172194 if err != nil {
173- return nil , & ListEntriesError {uri , "GET failed: " + err .Error ()}
195+ return nil , & ListEntriesError {uri . String () , "GET failed: " + err .Error ()}
174196 }
175197 defer response .Body .Close ()
176198
177199 //check that we actually got a directory listing
178200 if ! strings .HasPrefix (response .Status , "2" ) {
179- return nil , & ListEntriesError {uri , "GET returned status " + response .Status }
201+ return nil , & ListEntriesError {uri . String () , "GET returned status " + response .Status }
180202 }
181203 contentType := response .Header .Get ("Content-Type" )
182204 if ! strings .HasPrefix (contentType , "text/html" ) {
183- return nil , & ListEntriesError {uri , "GET returned unexpected Content-Type: " + contentType }
205+ return nil , & ListEntriesError {uri . String () , "GET returned unexpected Content-Type: " + contentType }
184206 }
185207
186208 //find links inside the HTML document
@@ -209,38 +231,45 @@ func (u URLSource) ListEntries(path string) ([]string, *ListEntriesError) {
209231 continue
210232 }
211233
234+ hrefURL , err := url .Parse (href )
235+ if err != nil {
236+ util .Log (util .LogError , "scrape %s: ignoring href attribute '%s' which is not a valid URL" , uri .String (), href )
237+ continue
238+ }
239+
212240 //filter external links with full URLs
213- if schemeRx . MatchString ( href ) {
241+ if hrefURL . Scheme != "" || hrefURL . Host != "" {
214242 continue
215243 }
216- //links with trailing slashes are absolute paths as well; either to
217- //another server, e.g. "//ajax.googleapis.com/jquery.js", or to the
218- //toplevel of this server, e.g. "/static/site.css")
219- if strings .HasPrefix (href , "/" ) {
244+ //ignore internal links, and links with a query part (Apache directory
245+ //listings use these for adjustable sorting)
246+ if hrefURL .RawQuery != "" || hrefURL .Fragment != "" {
220247 continue
221248 }
222- //links with ".." path elements cannot be guaranteed to be pointing to a
223- //resource below this directory, so skip them as well (this assumes that
224- //the sender did already clean his relative links so that no ".." appears
225- //in legitimate downward links)
226- if dotdotRx .MatchString (href ) {
249+ //ignore absolute paths to the toplevel of this server, e.g. "/static/site.css")
250+ if strings .HasPrefix (hrefURL .Path , "/" ) {
227251 continue
228252 }
229- //ignore links with a query part (Apache directory listings use these for
230- //adjustable sorting)
231- if strings .Contains (href , "?" ) {
253+
254+ //cleanup path, but retain trailing slash to tell directories and files apart
255+ linkPath := path .Clean (hrefURL .Path )
256+ if strings .HasSuffix (hrefURL .Path , "/" ) {
257+ linkPath += "/"
258+ }
259+ //ignore links leading outside the current directory
260+ if dotdotRx .MatchString (hrefURL .Path ) {
232261 continue
233262 }
234263
235- result = append (result , href )
264+ result = append (result , linkPath )
236265 }
237266 }
238267 }
239268}
240269
241270//GetFile implements the Source interface.
242- func (u URLSource ) GetFile (path string , targetState FileState ) (io.ReadCloser , FileState , error ) {
243- uri := u .getURLForPath (path )
271+ func (u URLSource ) GetFile (directoryPath string , targetState FileState ) (io.ReadCloser , FileState , error ) {
272+ uri := u .getURLForPath (directoryPath ). String ( )
244273
245274 //prepare request to retrieve from source, taking advantage of Etag and
246275 //Last-Modified where possible
@@ -287,12 +316,7 @@ func (u URLSource) GetFile(path string, targetState FileState) (io.ReadCloser, F
287316 }, nil
288317}
289318
290- //Return the URL for the given path below this URLSource.
291- func (u URLSource ) getURLForPath (path string ) string {
292- result := u .URL
293- if ! strings .HasSuffix (result , "/" ) {
294- result += "/"
295- }
296-
297- return result + strings .TrimPrefix (path , "/" )
319+ //Return the URL for the given directoryPath below this URLSource.
320+ func (u URLSource ) getURLForPath (directoryPath string ) * url.URL {
321+ return u .URL .ResolveReference (& url.URL {Path : strings .TrimPrefix (directoryPath , "/" )})
298322}
0 commit comments