@@ -1,10 +1,6 @@
-using System.IO.Compression;
-using System.Net.Http.Headers;
-using System.Runtime.CompilerServices;
+using System.Net.Http.Headers;
 using System.Text;
-using System.Transactions;
 using System.Xml;
-using System.Xml.Linq;
 using System.Xml.Serialization;
 
 namespace RobotsParser
@@ -14,15 +10,14 @@ public interface IRobots
         Task<bool> LoadRobotsFromUrl(string robotsUrl);
         Task<bool> LoadRobotsContent(string robotsContent);
         IReadOnlyList<Useragent> UserAgents { get; }
-        IEnumerable<string> Sitemaps { get; }
+        HashSet<string> Sitemaps { get; }
         IEnumerable<string> GetAllowedPaths(string userAgent = "*");
         IEnumerable<string> GetDisallowedPaths(string userAgent = "*");
         bool IsPathAllowed(string path, string userAgent = "*");
         bool IsPathDisallowed(string path, string userAgent = "*");
         int GetCrawlDelay(string userAgent = "*");
-        Task<IReadOnlyList<tSitemap>> GetSitemapIndexes(string sitemapUrl = null);
+        Task<IReadOnlyList<tSitemap>> GetSitemapIndexes(string? sitemapUrl = null);
         Task<IReadOnlyList<tUrl>> GetUrls(tSitemap tSitemap);
-        string? SitemapUrl { get; set; }
     }
 
     public class ProgressEventArgs : EventArgs
@@ -36,16 +31,13 @@ public ProgressEventArgs(string progressMessage)
 
     public class Robots : IRobots, IDisposable
     {
-        private readonly Uri _robotsUri;
-        private string? _robots;
+        private string? _robotsContent;
         private readonly HttpClient _client;
         private readonly bool _supressSitemapErrors;
 
         public event ProgressEventHandler? OnProgress;
         public delegate void ProgressEventHandler(object sender, ProgressEventArgs e);
 
-        public string? SitemapUrl { get; set; }
-
         public Robots(string userAgent, bool supressSitemapErrors = false)
         {
             _supressSitemapErrors = supressSitemapErrors;
@@ -82,14 +74,14 @@ private void RaiseOnProgress(string progressMessage)
 
         private async Task ParseRobots()
         {
-            if (_robots is null)
+            if (_robotsContent is null)
                 throw new RobotsNotloadedException();
 
             _userAgents = new List<Useragent>();
-            _sitemaps = new HashSet<string>();
+            _sitemaps ??= new HashSet<string>();
 
             string? line;
-            using (StringReader sr = new StringReader(_robots))
+            using (StringReader sr = new StringReader(_robotsContent))
             {
                 Useragent currentAgent = new Useragent("*");
                 while ((line = await sr.ReadLineAsync()) != null)
@@ -114,8 +106,6 @@ private async Task ParseRobots()
                         throw new Exception($"Unable to parse {line} in robots.txt");
                 }
             }
-
-            SitemapUrl = _sitemaps.FirstOrDefault();
         }
 
         #region Interface Methods
@@ -130,7 +120,7 @@ public async Task<bool> LoadRobotsFromUrl(string robotsUrl)
                 var response = await _client.GetAsync(robots);
                 response.EnsureSuccessStatusCode();
 
-                _robots = await response.Content.ReadAsStringAsync();
+                _robotsContent = await response.Content.ReadAsStringAsync();
             }
             catch (HttpRequestException e)
             {
@@ -144,7 +134,7 @@ public async Task<bool> LoadRobotsFromUrl(string robotsUrl)
 
         public async Task<bool> LoadRobotsContent(string robotsContent)
         {
-            _robots = robotsContent;
+            _robotsContent = robotsContent;
             await ParseRobots();
             return true;
         }
@@ -161,7 +151,7 @@ public IReadOnlyList<Useragent> UserAgents
         }
 
         private HashSet<string>? _sitemaps;
-        public IEnumerable<string> Sitemaps
+        public HashSet<string> Sitemaps
         {
             get
             {
@@ -199,37 +189,28 @@ public int GetCrawlDelay(string userAgent = "*") {
 
         public async Task<IReadOnlyList<tSitemap>> GetSitemapIndexes(string? sitemapUrl = null)
         {
-            SitemapUrl ??= sitemapUrl;
-
-            if (!string.IsNullOrEmpty(SitemapUrl))
+            if (!string.IsNullOrEmpty(sitemapUrl))
             {
                 _sitemaps ??= new HashSet<string>();
-                _sitemaps.Add(SitemapUrl);
+                _sitemaps.Add(sitemapUrl);
             }
 
             if (_sitemaps is null)
             {
-                if (_robots is null)
+                if (_robotsContent is null)
                     throw new RobotsNotloadedException("Please call LoadRobotsFromUrl, LoadRobotsContent or pass a sitemap url to GetSitemapIndexes.");
 
                 return new List<tSitemap>();
             }
 
-            //If not value given from user then go through _sitemaps.
-            if (string.IsNullOrEmpty(SitemapUrl))
-            {
-                List<tSitemap> sitemaps = new List<tSitemap>(100000);
-                if (_sitemaps.Count > 0)
-                {
-                    foreach (var sitemap in _sitemaps)
-                        sitemaps.AddRange(await GetSitemapsInternal(sitemap));
-                }
-                return sitemaps;
-            }
-            else
+            List<tSitemap> sitemaps = new List<tSitemap>(100000);
+            if (_sitemaps.Any())
             {
-                return await GetSitemapsInternal(SitemapUrl);
+                foreach (var sitemap in _sitemaps)
+                    sitemaps.AddRange(await GetSitemapsInternal(sitemap));
             }
+
+            return sitemaps;
         }
 
         public async Task<IReadOnlyList<tUrl>> GetUrls(tSitemap tSitemap)
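
Below is a minimal usage sketch of the API as it stands after this change: sitemap discovery now flows through the parsed Sitemap: directives, or through an explicit URL passed to GetSitemapIndexes, rather than the removed SitemapUrl property. The host URL and user-agent string are illustrative placeholders, not part of this diff.

// Minimal usage sketch (C#, top-level statements). Assumes the RobotsParser
// types shown in the diff above; "example.com" and "ExampleBot/1.0" are
// hypothetical placeholders.
using RobotsParser;

var robots = new Robots(userAgent: "ExampleBot/1.0");

// Fetch and parse robots.txt from the placeholder host.
await robots.LoadRobotsFromUrl("https://example.com/robots.txt");

if (robots.IsPathAllowed("/products"))
{
    // With SitemapUrl removed, discovery starts from the parsed Sitemap:
    // directives; an explicit index URL may still be passed as the
    // optional argument to GetSitemapIndexes.
    var indexes = await robots.GetSitemapIndexes();
    foreach (var sitemapIndex in indexes)
    {
        var urls = await robots.GetUrls(sitemapIndex);
        Console.WriteLine($"Sitemap yielded {urls.Count} urls.");
    }
}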