
Commit df4268b

Author: Nicholas Bergesen

Merge pull request #3 from Nowongu/master

modify

2 parents 5983e73 + 6bf751f

File tree

2 files changed (+53, -26 lines)


Nick.RobotsParser/RobotsNotloadedException.cs

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@ namespace RobotsParser
 {
     public class RobotsNotloadedException : Exception
     {
-        public RobotsNotloadedException() : base("Please call Load or LoadAsync.")
+        public RobotsNotloadedException() : base("Please call LoadRobotsFromUrl or LoadRobotsContent.")
         {
         }
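The message update tracks the method renames in RobotsParser.cs below: Load() and Load(string) become LoadRobotsFromUrl and LoadRobotsContent. A minimal sketch, using only the parameterless constructor shown in this diff, makes the change visible without any network call:

    using System;
    using RobotsParser;

    // The guidance message is baked into the parameterless constructor,
    // so the rename shows up directly on Message.
    var e = new RobotsNotloadedException();
    Console.WriteLine(e.Message);
    // Before this commit: "Please call Load or LoadAsync."
    // After this commit:  "Please call LoadRobotsFromUrl or LoadRobotsContent."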

Nick.RobotsParser/RobotsParser.cs

Lines changed: 52 additions & 25 deletions
@@ -1,6 +1,8 @@
 using System.IO.Compression;
 using System.Net.Http.Headers;
+using System.Runtime.CompilerServices;
 using System.Text;
+using System.Transactions;
 using System.Xml;
 using System.Xml.Linq;
 using System.Xml.Serialization;
@@ -9,17 +11,18 @@ namespace RobotsParser
 {
     public interface IRobots
     {
-        Task Load();
-        Task Load(string robotsContent);
+        Task<bool> LoadRobotsFromUrl(string robotsUrl);
+        Task<bool> LoadRobotsContent(string robotsContent);
         IReadOnlyList<Useragent> UserAgents { get; }
         IEnumerable<string> Sitemaps { get; }
         IEnumerable<string> GetAllowedPaths(string userAgent = "*");
         IEnumerable<string> GetDisallowedPaths(string userAgent = "*");
         bool IsPathAllowed(string path, string userAgent = "*");
         bool IsPathDisallowed(string path, string userAgent = "*");
         int GetCrawlDelay(string userAgent = "*");
-        Task<IReadOnlyList<tSitemap>> GetSitemapIndexes(string sitemapUrl = "");
+        Task<IReadOnlyList<tSitemap>> GetSitemapIndexes(string sitemapUrl = null);
         Task<IReadOnlyList<tUrl>> GetUrls(tSitemap tSitemap);
+        string? SitemapUrl { get; set; }
     }

     public class ProgressEventArgs : EventArgs
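Taken together, the IRobots changes move the robots.txt source from construction time to load time and report fetch failures through a bool instead of an exception. A minimal sketch of the new call pattern, assuming only the members shown in this diff (the URL and user agent are illustrative):

    using System;
    using RobotsParser;

    // No website Uri at construction; the target is supplied per load call.
    var robots = new Robots(userAgent: "ExampleBot/1.0");

    // LoadRobotsFromUrl returns false on an HTTP failure instead of throwing.
    if (await robots.LoadRobotsFromUrl("https://example.com/robots.txt"))
    {
        Console.WriteLine(robots.IsPathAllowed("/search", "ExampleBot"));
        Console.WriteLine(robots.GetCrawlDelay("ExampleBot"));
        Console.WriteLine(robots.SitemapUrl); // first Sitemap: entry, if any
    }
    else
    {
        Console.WriteLine("robots.txt could not be fetched");
    }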
@@ -41,21 +44,16 @@ public class Robots : IRobots, IDisposable
         public event ProgressEventHandler? OnProgress;
         public delegate void ProgressEventHandler(object sender, ProgressEventArgs e);

-        public Robots(Uri websiteUri, string userAgent, bool supressSitemapErrors = false)
-        {
-            if(websiteUri is null)
-                throw new ArgumentNullException(nameof(websiteUri));
-
-            if (!Uri.TryCreate(websiteUri, "/robots.txt", out Uri? robots))
-                throw new ArgumentException($"Unable to append robots.txt to {websiteUri}");
+        public string? SitemapUrl { get; set; }

+        public Robots(string userAgent, bool supressSitemapErrors = false)
+        {
             _supressSitemapErrors = supressSitemapErrors;
-            _robotsUri = robots;
             HttpClientHandler handler = new HttpClientHandler
             {
                 AutomaticDecompression = System.Net.DecompressionMethods.All,
                 AllowAutoRedirect = true,
-                MaxAutomaticRedirections = 5
+                MaxAutomaticRedirections = 15,
             };
             _client = new HttpClient(handler, true);
             _client.DefaultRequestHeaders.TryAddWithoutValidation("User-Agent", userAgent);
@@ -72,13 +70,9 @@ public Robots(Uri websiteUri, string userAgent, bool supressSitemapErrors = fals
                 NoCache = true
             };
             _client.DefaultRequestHeaders.Connection.Add("keep-alive");
-            _client.DefaultRequestHeaders.Host = websiteUri.Host;
             _client.DefaultRequestHeaders.Pragma.Add(new NameValueHeaderValue("no-cache"));
         }

-        public Robots(string websiteUri, string userAgent, bool supressSitemapErrors = false)
-            : this(new Uri(websiteUri), userAgent, supressSitemapErrors) { }
-
         private void RaiseOnProgress(string progressMessage)
         {
             if (OnProgress is null)
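Both Uri-accepting constructor overloads are removed along with the fixed Host header, which HttpClient derives from each request Uri anyway. A hedged migration sketch for existing callers (old calls shown as comments):

    using RobotsParser;

    // Before this commit (removed overloads):
    //   var robots = new Robots(new Uri("https://example.com"), "ExampleBot/1.0");
    //   var robots = new Robots("https://example.com", "ExampleBot/1.0");
    //   await robots.Load();

    // After this commit the robots.txt location moves to the load call, so an
    // instance is no longer tied to one host. (Whether earlier parse state is
    // cleared between loads is not shown in this diff.)
    var robots = new Robots("ExampleBot/1.0", supressSitemapErrors: true);
    await robots.LoadRobotsFromUrl("https://example.com/robots.txt");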
@@ -114,26 +108,45 @@ private async Task ParseRobots()
                         _sitemaps.Add(line.Substring(Const.SitemapLength, line.Length - Const.SitemapLength).Trim(' '));
                     else if (line.ToLower().StartsWith(Const.Crawldelay))
                         currentAgent.Crawldelay = int.Parse(line.Substring(Const.CrawldelayLength, line.Length - Const.CrawldelayLength).Trim(' '));
-                    else if (line == string.Empty || line[0] == '#')
+                    else if (line == string.Empty || line[0] == '#' || line == "<!DOCTYPE html> ")
                         continue;
                     else
                         throw new Exception($"Unable to parse {line} in robots.txt");
                 }
             }
+
+            SitemapUrl = _sitemaps.FirstOrDefault();
         }

         #region Interface Methods

-        public async Task Load()
+        public async Task<bool> LoadRobotsFromUrl(string robotsUrl)
         {
-            _robots = await _client.GetStringAsync(_robotsUri);
+            if (!Uri.TryCreate(robotsUrl, UriKind.Absolute, out Uri? robots))
+                throw new ArgumentException($"Unable to append robots.txt to {robotsUrl}");
+
+            try
+            {
+                var response = await _client.GetAsync(robots);
+                response.EnsureSuccessStatusCode();
+
+                _robots = await response.Content.ReadAsStringAsync();
+            }
+            catch (HttpRequestException e)
+            {
+                Console.WriteLine(e.Message);
+                return false;
+            }
+
             await ParseRobots();
+            return true;
         }

-        public async Task Load(string robotsContent)
+        public async Task<bool> LoadRobotsContent(string robotsContent)
         {
             _robots = robotsContent;
             await ParseRobots();
+            return true;
         }

         private List<Useragent>? _userAgents;
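LoadRobotsContent is the offline counterpart: it parses text the caller already has, which suits unit tests or content fetched by other means. A small sketch, assuming a hypothetical fixture file:

    using System;
    using System.IO;
    using RobotsParser;

    var robots = new Robots("ExampleBot/1.0");

    // Parse robots.txt text obtained elsewhere, e.g. a test fixture on disk.
    string content = await File.ReadAllTextAsync("fixtures/robots.txt");
    bool loaded = await robots.LoadRobotsContent(content);

    // As of this commit the content overload always returns true; the bool
    // exists so both load methods share the new IRobots signature.
    Console.WriteLine($"loaded: {loaded}, first sitemap: {robots.SitemapUrl}");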
@@ -184,12 +197,26 @@ public int GetCrawlDelay(string userAgent = "*") {
184197
return crawlDelay ?? 0;
185198
}
186199

187-
public async Task<IReadOnlyList<tSitemap>> GetSitemapIndexes(string sitemapUrl = "")
200+
public async Task<IReadOnlyList<tSitemap>> GetSitemapIndexes(string? sitemapUrl = null)
188201
{
189-
if(_sitemaps is null)
190-
throw new RobotsNotloadedException();
202+
SitemapUrl ??= sitemapUrl;
203+
204+
if (!string.IsNullOrEmpty(SitemapUrl))
205+
{
206+
_sitemaps ??= new HashSet<string>();
207+
_sitemaps.Add(SitemapUrl);
208+
}
209+
210+
if (_sitemaps is null)
211+
{
212+
if (_robots is null)
213+
throw new RobotsNotloadedException("Please call LoadRobotsFromUrl, LoadRobotsContent or pass a sitemap url to GetSitemapIndexes.");
214+
215+
return new List<tSitemap>();
216+
}
191217

192-
if (string.Empty.Equals(sitemapUrl))
218+
//If not value given from user then go through _sitemaps.
219+
if (string.IsNullOrEmpty(SitemapUrl))
193220
{
194221
List<tSitemap> sitemaps = new List<tSitemap>(100000);
195222
if(_sitemaps.Count > 0)
@@ -201,7 +228,7 @@ public async Task<IReadOnlyList<tSitemap>> GetSitemapIndexes(string sitemapUrl =
             }
             else
             {
-                return await GetSitemapsInternal(sitemapUrl);
+                return await GetSitemapsInternal(SitemapUrl);
             }
         }
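The reworked GetSitemapIndexes no longer requires a prior load: a caller-supplied sitemap URL seeds the SitemapUrl property directly, and a loaded robots.txt that declares no sitemaps now yields an empty list rather than an exception. A hedged sketch of the two entry points (URLs are illustrative; tSitemap and tUrl are the sitemap schema types the library already uses):

    using System;
    using RobotsParser;

    // Entry point 1: skip robots.txt entirely and hand the sitemap index in.
    var direct = new Robots("ExampleBot/1.0");
    var indexes = await direct.GetSitemapIndexes("https://example.com/sitemap.xml");
    foreach (var index in indexes)
    {
        var urls = await direct.GetUrls(index); // tUrl entries of one sitemap
        Console.WriteLine(urls.Count);
    }

    // Entry point 2: load robots.txt first. ParseRobots records the first
    // Sitemap: entry in SitemapUrl, and a no-argument call starts from it.
    var viaRobots = new Robots("ExampleBot/1.0");
    if (await viaRobots.LoadRobotsFromUrl("https://example.com/robots.txt"))
    {
        var fromRobots = await viaRobots.GetSitemapIndexes();
        Console.WriteLine($"{fromRobots.Count} sitemap indexes found");
    }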
