@@ -1,6 +1,8 @@
 using System.IO.Compression;
 using System.Net.Http.Headers;
+using System.Runtime.CompilerServices;
 using System.Text;
+using System.Transactions;
 using System.Xml;
 using System.Xml.Linq;
 using System.Xml.Serialization;
@@ -9,17 +11,18 @@ namespace RobotsParser
 {
     public interface IRobots
     {
-        Task Load();
-        Task Load(string robotsContent);
+        Task<bool> LoadRobotsFromUrl(string robotsUrl);
+        Task<bool> LoadRobotsContent(string robotsContent);
         IReadOnlyList<Useragent> UserAgents { get; }
         IEnumerable<string> Sitemaps { get; }
         IEnumerable<string> GetAllowedPaths(string userAgent = "*");
         IEnumerable<string> GetDisallowedPaths(string userAgent = "*");
         bool IsPathAllowed(string path, string userAgent = "*");
         bool IsPathDisallowed(string path, string userAgent = "*");
         int GetCrawlDelay(string userAgent = "*");
-        Task<IReadOnlyList<tSitemap>> GetSitemapIndexes(string sitemapUrl = "");
+        Task<IReadOnlyList<tSitemap>> GetSitemapIndexes(string? sitemapUrl = null);
         Task<IReadOnlyList<tUrl>> GetUrls(tSitemap tSitemap);
+        string? SitemapUrl { get; set; }
     }

     public class ProgressEventArgs : EventArgs
@@ -41,21 +44,16 @@ public class Robots : IRobots, IDisposable
         public event ProgressEventHandler? OnProgress;
         public delegate void ProgressEventHandler(object sender, ProgressEventArgs e);

-        public Robots(Uri websiteUri, string userAgent, bool supressSitemapErrors = false)
-        {
-            if (websiteUri is null)
-                throw new ArgumentNullException(nameof(websiteUri));
-
-            if (!Uri.TryCreate(websiteUri, "/robots.txt", out Uri? robots))
-                throw new ArgumentException($"Unable to append robots.txt to {websiteUri}");
+        public string? SitemapUrl { get; set; }

+        public Robots(string userAgent, bool supressSitemapErrors = false)
+        {
             _supressSitemapErrors = supressSitemapErrors;
-            _robotsUri = robots;
             HttpClientHandler handler = new HttpClientHandler
             {
                 AutomaticDecompression = System.Net.DecompressionMethods.All,
                 AllowAutoRedirect = true,
-                MaxAutomaticRedirections = 5
+                MaxAutomaticRedirections = 15,
             };
             _client = new HttpClient(handler, true);
             _client.DefaultRequestHeaders.TryAddWithoutValidation("User-Agent", userAgent);
@@ -72,13 +70,9 @@ public Robots(Uri websiteUri, string userAgent, bool supressSitemapErrors = fals
                 NoCache = true
             };
             _client.DefaultRequestHeaders.Connection.Add("keep-alive");
-            _client.DefaultRequestHeaders.Host = websiteUri.Host;
             _client.DefaultRequestHeaders.Pragma.Add(new NameValueHeaderValue("no-cache"));
         }

-        public Robots(string websiteUri, string userAgent, bool supressSitemapErrors = false)
-            : this(new Uri(websiteUri), userAgent, supressSitemapErrors) { }
-
         private void RaiseOnProgress(string progressMessage)
         {
             if (OnProgress is null)
@@ -114,26 +108,45 @@ private async Task ParseRobots()
                         _sitemaps.Add(line.Substring(Const.SitemapLength, line.Length - Const.SitemapLength).Trim(' '));
                     else if (line.ToLower().StartsWith(Const.Crawldelay))
                         currentAgent.Crawldelay = int.Parse(line.Substring(Const.CrawldelayLength, line.Length - Const.CrawldelayLength).Trim(' '));
-                    else if (line == string.Empty || line[0] == '#')
+                    else if (line == string.Empty || line[0] == '#' || line == "<!DOCTYPE html>")
                         continue;
                     else
                         throw new Exception($"Unable to parse {line} in robots.txt");
                 }
             }
+
+            SitemapUrl = _sitemaps.FirstOrDefault();
         }

         #region Interface Methods

-        public async Task Load()
+        public async Task<bool> LoadRobotsFromUrl(string robotsUrl)
         {
-            _robots = await _client.GetStringAsync(_robotsUri);
+            if (!Uri.TryCreate(robotsUrl, UriKind.Absolute, out Uri? robots))
+                throw new ArgumentException($"Unable to parse {robotsUrl} as an absolute url");
+
+            try
+            {
+                var response = await _client.GetAsync(robots);
+                response.EnsureSuccessStatusCode();
+
+                _robots = await response.Content.ReadAsStringAsync();
+            }
+            catch (HttpRequestException e)
+            {
+                Console.WriteLine(e.Message);
+                return false;
+            }
+
             await ParseRobots();
+            return true;
         }

-        public async Task Load(string robotsContent)
+        public async Task<bool> LoadRobotsContent(string robotsContent)
         {
             _robots = robotsContent;
             await ParseRobots();
+            return true;
         }

         private List<Useragent>? _userAgents;
@@ -184,12 +197,26 @@ public int GetCrawlDelay(string userAgent = "*") {
             return crawlDelay ?? 0;
         }

-        public async Task<IReadOnlyList<tSitemap>> GetSitemapIndexes(string sitemapUrl = "")
+        public async Task<IReadOnlyList<tSitemap>> GetSitemapIndexes(string? sitemapUrl = null)
         {
-            if (_sitemaps is null)
-                throw new RobotsNotloadedException();
+            SitemapUrl ??= sitemapUrl;
+
+            if (!string.IsNullOrEmpty(SitemapUrl))
+            {
+                _sitemaps ??= new HashSet<string>();
+                _sitemaps.Add(SitemapUrl);
+            }
+
+            if (_sitemaps is null)
+            {
+                if (_robots is null)
+                    throw new RobotsNotloadedException("Please call LoadRobotsFromUrl, LoadRobotsContent or pass a sitemap url to GetSitemapIndexes.");
+
+                return new List<tSitemap>();
+            }

-            if (string.Empty.Equals(sitemapUrl))
+            // If no value was given by the user, go through _sitemaps.
+            if (string.IsNullOrEmpty(SitemapUrl))
             {
                 List<tSitemap> sitemaps = new List<tSitemap>(100000);
                 if (_sitemaps.Count > 0)
@@ -201,7 +228,7 @@ public async Task<IReadOnlyList<tSitemap>> GetSitemapIndexes(string sitemapUrl =
             }
             else
             {
-                return await GetSitemapsInternal(sitemapUrl);
+                return await GetSitemapsInternal(SitemapUrl);
             }
         }

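A minimal usage sketch of how the reworked API reads after this diff. This is an assumed caller, not part of the change: "MyCrawler/1.0" and example.com are placeholder values, and tSitemap/tUrl come from the library itself.

using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using RobotsParser;

class Demo
{
    static async Task Main()
    {
        // The constructor no longer takes a website uri, only the user agent.
        using var robots = new Robots(userAgent: "MyCrawler/1.0");

        // LoadRobotsFromUrl now takes the full robots.txt url and signals
        // HTTP failures with a bool instead of throwing.
        if (!await robots.LoadRobotsFromUrl("https://example.com/robots.txt"))
            return;

        if (robots.IsPathAllowed("/catalog"))
        {
            // SitemapUrl is populated from the parsed robots.txt; an explicit
            // sitemap url may also be passed to GetSitemapIndexes instead.
            IReadOnlyList<tSitemap> indexes = await robots.GetSitemapIndexes();
            foreach (tSitemap sitemap in indexes)
            {
                IReadOnlyList<tUrl> urls = await robots.GetUrls(sitemap);
                Console.WriteLine($"{urls.Count} urls found");
            }
        }
    }
}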