1- using System . Net . Http . Headers ;
2- using System . Text ;
3- using System . Xml ;
1+ using System . Xml ;
42using System . Xml . Serialization ;
53
64namespace RobotsParser
@@ -29,40 +27,19 @@ public ProgressEventArgs(string progressMessage)
2927 }
3028 }
3129
32- public class Robots : IRobots , IDisposable
30+ public class Robots : IRobots
3331 {
3432 private string ? _robotsContent ;
35- private readonly HttpClient _client ;
3633 private readonly bool _supressSitemapErrors ;
34+ private readonly Func < string , Task < string > > _downloadFunc ;
3735
3836 public event ProgressEventHandler ? OnProgress ;
3937 public delegate void ProgressEventHandler ( object sender , ProgressEventArgs e ) ;
4038
/// <summary>
/// Creates a robots.txt / sitemap reader that delegates every download to a caller-supplied function.
/// </summary>
/// <param name="downloadFunc">
/// Asynchronous callback that is invoked with a URL and returns the downloaded content as a string.
/// </param>
/// <param name="supressSitemapErrors">
/// When <c>true</c>, sitemap content that cannot be deserialized does not raise an exception.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="downloadFunc"/> is <c>null</c>.</exception>
public Robots(Func<string, Task<string>> downloadFunc, bool supressSitemapErrors = false)
{
    // Fail fast: a null delegate would otherwise only surface later as a NullReferenceException
    // inside WebRequest, where the general catch turns it into a progress message and an empty result.
    _downloadFunc = downloadFunc ?? throw new ArgumentNullException(nameof(downloadFunc));
    _supressSitemapErrors = supressSitemapErrors;
}
6744
6845 private void RaiseOnProgress ( string progressMessage )
@@ -81,52 +58,58 @@ private async Task ParseRobots()
8158 _sitemaps ??= new HashSet < string > ( ) ;
8259
8360 string ? line ;
84- using ( StringReader sr = new StringReader ( _robotsContent ) )
61+ using StringReader sr = new ( _robotsContent ) ;
62+ Useragent currentAgent = new ( "*" ) ;
63+ while ( ( line = await sr . ReadLineAsync ( ) ) != null )
8564 {
86- Useragent currentAgent = new Useragent ( "*" ) ;
87- while ( ( line = await sr . ReadLineAsync ( ) ) != null )
65+ if ( line . ToLower ( ) . StartsWith ( Const . UserAgent . ToLower ( ) ) )
8866 {
89- if ( line . ToLower ( ) . StartsWith ( Const . UserAgent . ToLower ( ) ) )
90- {
91- string name = line . Substring ( Const . UserAgentLength , line . Length - Const . UserAgentLength ) . Trim ( ' ' ) ;
92- currentAgent = new Useragent ( name ) ;
93- _userAgents . Add ( currentAgent ) ;
94- }
95- else if ( line . ToLower ( ) . StartsWith ( Const . Disallow ) )
96- currentAgent . Disallowed . Add ( line . Substring ( Const . DisallowLength , line . Length - Const . DisallowLength ) . Trim ( ' ' ) ) ;
97- else if ( line . ToLower ( ) . StartsWith ( Const . Allow ) )
98- currentAgent . Allowed . Add ( line . Substring ( Const . AllowLength , line . Length - Const . AllowLength ) . Trim ( ' ' ) ) ;
99- else if ( line . ToLower ( ) . StartsWith ( Const . Sitemap ) )
100- _sitemaps . Add ( line . Substring ( Const . SitemapLength , line . Length - Const . SitemapLength ) . Trim ( ' ' ) ) ;
101- else if ( line . ToLower ( ) . StartsWith ( Const . Crawldelay ) )
102- currentAgent . Crawldelay = int . Parse ( line . Substring ( Const . CrawldelayLength , line . Length - Const . CrawldelayLength ) . Trim ( ' ' ) ) ;
103- else if ( line == string . Empty || line [ 0 ] == '#' || line == "<!DOCTYPE html> " )
104- continue ;
105- else
106- throw new Exception ( $ "Unable to parse { line } in robots.txt") ;
67+ string name = line . Substring ( Const . UserAgentLength , line . Length - Const . UserAgentLength ) . Trim ( ' ' ) ;
68+ currentAgent = new Useragent ( name ) ;
69+ _userAgents . Add ( currentAgent ) ;
10770 }
71+ else if ( line . ToLower ( ) . StartsWith ( Const . Disallow ) )
72+ currentAgent . Disallowed . Add ( line . Substring ( Const . DisallowLength , line . Length - Const . DisallowLength ) . Trim ( ' ' ) ) ;
73+ else if ( line . ToLower ( ) . StartsWith ( Const . Allow ) )
74+ currentAgent . Allowed . Add ( line . Substring ( Const . AllowLength , line . Length - Const . AllowLength ) . Trim ( ' ' ) ) ;
75+ else if ( line . ToLower ( ) . StartsWith ( Const . Sitemap ) )
76+ _sitemaps . Add ( line . Substring ( Const . SitemapLength , line . Length - Const . SitemapLength ) . Trim ( ' ' ) ) ;
77+ else if ( line . ToLower ( ) . StartsWith ( Const . Crawldelay ) )
78+ currentAgent . Crawldelay = int . Parse ( line . Substring ( Const . CrawldelayLength , line . Length - Const . CrawldelayLength ) . Trim ( ' ' ) ) ;
79+ else if ( line == string . Empty || line [ 0 ] == '#' || line == "<!DOCTYPE html> " )
80+ continue ;
81+ else
82+ throw new Exception ( $ "Unable to parse { line } in robots.txt") ;
10883 }
10984 }
11085
/// <summary>
/// Downloads <paramref name="url"/> through the configured download function.
/// Any exception thrown by the download is reported through the progress event
/// and an empty string is returned in its place.
/// </summary>
/// <param name="url">The address to download.</param>
/// <returns>The downloaded content, or <see cref="string.Empty"/> when the download threw.</returns>
private async Task<string> WebRequest(string url)
{
    try
    {
        string content = await _downloadFunc(url);
        return content;
    }
    catch (Exception ex)
    {
        // HTTP failures carry a status code worth reporting; everything else only has a message.
        string report = ex is HttpRequestException httpFailure
            ? $"Web request returned failed status code: {httpFailure.StatusCode}\r\n{httpFailure.Message}"
            : $"Error during web request:\r\n{ex.Message}";

        RaiseOnProgress(report);
        return string.Empty;
    }
}
103+
111104 #region Interface Methods
112105
/// <summary>
/// Downloads the robots.txt file at <paramref name="robotsUrl"/> and parses it.
/// </summary>
/// <param name="robotsUrl">Absolute URL of the robots.txt file.</param>
/// <returns>
/// <c>true</c> when content was downloaded and parsed; <c>false</c> when the download
/// produced no content (null, empty or whitespace only).
/// </returns>
/// <exception cref="ArgumentException"><paramref name="robotsUrl"/> is not an absolute URI.</exception>
public async Task<bool> LoadRobotsFromUrl(string robotsUrl)
{
    if (!Uri.TryCreate(robotsUrl, UriKind.Absolute, out Uri? robots))
        throw new ArgumentException($"Unable to append robots.txt to {robotsUrl}");

    _robotsContent = await WebRequest(robots.ToString());

    // WebRequest returns an empty string when the download fails, so blank content means
    // there is nothing to parse. (The previous check was negated and bailed out on success.)
    if (string.IsNullOrWhiteSpace(_robotsContent))
        return false;

    await ParseRobots();
    return true;
}
@@ -145,7 +128,7 @@ public IReadOnlyList<Useragent> UserAgents
145128 get
146129 {
147130 if ( _userAgents is null )
148- throw new RobotsNotloadedException ( ) ;
131+ throw new RobotsNotloadedException ( "Useragents is null" ) ;
149132 return _userAgents ;
150133 }
151134 }
@@ -156,7 +139,7 @@ public HashSet<string> Sitemaps
156139 get
157140 {
158141 if ( _sitemaps is null )
159- throw new RobotsNotloadedException ( ) ;
142+ throw new RobotsNotloadedException ( "Sitemaps is null" ) ;
160143 return _sitemaps ;
161144 }
162145 }
@@ -218,8 +201,8 @@ public async Task<IReadOnlyList<tUrl>> GetUrls(tSitemap tSitemap)
218201 if ( tSitemap is null )
219202 throw new ArgumentNullException ( nameof ( tSitemap ) , "sitemap requires a value" ) ;
220203
221- var bytes = await _client . GetByteArrayAsync ( tSitemap . loc ) ;
222- if ( TryDeserializeXMLStream ( bytes , out urlset ? urlSet ) && urlSet ? . url is not null )
204+ var response = await WebRequest ( tSitemap . loc ) ;
205+ if ( TryDeserializeXMLStream ( response , out urlset ? urlSet ) && urlSet ? . url is not null )
223206 return urlSet . url ;
224207 else if ( ! _supressSitemapErrors )
225208 throw new Exception ( $ "Unable to deserialize content from { tSitemap . loc } to type urlset") ;
@@ -231,8 +214,8 @@ public async Task<IReadOnlyList<tUrl>> GetUrls(tSitemap tSitemap)
231214
232215 private async Task < IReadOnlyList < tSitemap > > GetSitemapsInternal ( string sitemapUrl )
233216 {
234- var bytes = await _client . GetByteArrayAsync ( sitemapUrl ) ;
235- if ( TryDeserializeXMLStream ( bytes , out sitemapindex ? sitemapIndex ) && sitemapIndex ? . sitemap is not null )
217+ var response = await _downloadFunc ( sitemapUrl ) ;
218+ if ( TryDeserializeXMLStream ( response , out sitemapindex ? sitemapIndex ) && sitemapIndex ? . sitemap is not null )
236219 return sitemapIndex . sitemap ;
237220 else if ( ! _supressSitemapErrors )
238221 throw new Exception ( $ "Unable to deserialize content from { sitemapUrl } to type sitemapindex") ;
@@ -243,8 +226,8 @@ private async Task<IReadOnlyList<tSitemap>> GetSitemapsInternal(string sitemapUr
243226 private readonly List < tUrl > _sitemapLinks = new List < tUrl > ( 1000000 ) ;
244227 private async Task GetSitemapLinksInternal ( string siteIndex )
245228 {
246- var bytes = await _client . GetByteArrayAsync ( siteIndex ) ;
247- if ( TryDeserializeXMLStream ( bytes , out sitemapindex ? sitemapIndex ) && sitemapIndex ? . sitemap is not null )
229+ var response = await _downloadFunc ( siteIndex ) ;
230+ if ( TryDeserializeXMLStream ( response , out sitemapindex ? sitemapIndex ) && sitemapIndex ? . sitemap is not null )
248231 {
249232 foreach ( tSitemap sitemap in sitemapIndex . sitemap )
250233 {
@@ -253,24 +236,23 @@ private async Task GetSitemapLinksInternal(string siteIndex)
253236 }
254237 else
255238 {
256- if ( TryDeserializeXMLStream ( bytes , out urlset ? urlSet ) && urlSet ? . url is not null )
239+ if ( TryDeserializeXMLStream ( response , out urlset ? urlSet ) && urlSet ? . url is not null )
257240 {
258241 _sitemapLinks . AddRange ( urlSet . url . ToList ( ) ) ;
259242 RaiseOnProgress ( $ "{ _sitemapLinks . Count } ") ;
260243 }
261244 }
262245 }
263246
/// <summary>
/// Attempts to deserialize XML text into <typeparamref name="T"/>.
/// The text is passed through <c>StripVersionFromString</c> before it is handed to the
/// <see cref="TextReader"/> overload.
/// </summary>
/// <typeparam name="T">The type to deserialize into.</typeparam>
/// <param name="stringValue">The XML text to deserialize.</param>
/// <param name="xmlValue">The deserialized value on success; otherwise the default value.</param>
/// <returns><c>true</c> when deserialization succeeded; otherwise <c>false</c>.</returns>
private bool TryDeserializeXMLStream<T>(string stringValue, out T? xmlValue)
{
    // A failed download is represented by an empty string; blank text can never be a valid
    // document, so report failure up front instead of pushing it through the parser.
    if (string.IsNullOrWhiteSpace(stringValue))
    {
        xmlValue = default;
        return false;
    }

    stringValue = StripVersionFromString(stringValue);

    using StringReader sr = new StringReader(stringValue);
    return TryDeserializeXMLStream(sr, out xmlValue);
}
272254
273- private bool TryDeserializeXMLStream < T > ( TextReader reader , out T ? xmlValue )
255+ private static bool TryDeserializeXMLStream < T > ( TextReader reader , out T ? xmlValue )
274256 {
275257 try
276258 {
@@ -296,27 +278,5 @@ private string StripVersionFromString(string val)
296278 return val . Remove ( 0 , endChar + 2 ) ;
297279 return val ;
298280 }
299-
300- public void Dispose ( )
301- {
302- Dispose ( true ) ;
303- GC . SuppressFinalize ( this ) ;
304- }
305-
306- ~ Robots ( )
307- {
308- Dispose ( false ) ;
309- }
310-
311- protected virtual void Dispose ( bool disposing )
312- {
313- if ( disposing )
314- {
315- if ( _client != null )
316- {
317- _client . Dispose ( ) ;
318- }
319- }
320- }
321281 }
322282}
0 commit comments