
Commit e2a5cc0

Nicholas Bergesen committed:
Take in download func, target .net 8
1 parent 75cad7b commit e2a5cc0

File tree

3 files changed: +60 additions, -100 deletions

Nick.RobotsParser/Nick.RobotsParser.csproj

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 <Project Sdk="Microsoft.NET.Sdk">
 
   <PropertyGroup>
-    <TargetFramework>net6.0</TargetFramework>
+    <TargetFramework>net8.0</TargetFramework>
     <ImplicitUsings>enable</ImplicitUsings>
     <Nullable>enable</Nullable>
     <PackageId>Nick.RobotsParser</PackageId>

Nick.RobotsParser/RobotsParser.cs

Lines changed: 57 additions & 97 deletions

@@ -1,6 +1,4 @@
-using System.Net.Http.Headers;
-using System.Text;
-using System.Xml;
+using System.Xml;
 using System.Xml.Serialization;
 
 namespace RobotsParser
@@ -29,40 +27,19 @@ public ProgressEventArgs(string progressMessage)
         }
     }
 
-    public class Robots : IRobots, IDisposable
+    public class Robots : IRobots
     {
         private string? _robotsContent;
-        private readonly HttpClient _client;
         private readonly bool _supressSitemapErrors;
+        private readonly Func<string, Task<string>> _downloadFunc;
 
         public event ProgressEventHandler? OnProgress;
         public delegate void ProgressEventHandler(object sender, ProgressEventArgs e);
 
-        public Robots(string userAgent, bool supressSitemapErrors = false)
+        public Robots(Func<string, Task<string>> downloadFunc, bool supressSitemapErrors = false)
         {
             _supressSitemapErrors = supressSitemapErrors;
-            HttpClientHandler handler = new HttpClientHandler
-            {
-                AutomaticDecompression = System.Net.DecompressionMethods.All,
-                AllowAutoRedirect = true,
-                MaxAutomaticRedirections = 15,
-            };
-            _client = new HttpClient(handler, true);
-            _client.DefaultRequestHeaders.TryAddWithoutValidation("User-Agent", userAgent);
-            _client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("text/html"));
-            _client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/xhtml+xml"));
-            _client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/xml"));
-            _client.DefaultRequestHeaders.AcceptEncoding.Add(new StringWithQualityHeaderValue("deflate"));
-            _client.DefaultRequestHeaders.AcceptLanguage.Add(new StringWithQualityHeaderValue("en-ZA"));
-            _client.DefaultRequestHeaders.AcceptLanguage.Add(new StringWithQualityHeaderValue("en-GB"));
-            _client.DefaultRequestHeaders.AcceptLanguage.Add(new StringWithQualityHeaderValue("en-US"));
-            _client.DefaultRequestHeaders.AcceptLanguage.Add(new StringWithQualityHeaderValue("en"));
-            _client.DefaultRequestHeaders.CacheControl = new CacheControlHeaderValue
-            {
-                NoCache = true
-            };
-            _client.DefaultRequestHeaders.Connection.Add("keep-alive");
-            _client.DefaultRequestHeaders.Pragma.Add(new NameValueHeaderValue("no-cache"));
+            _downloadFunc = downloadFunc;
         }
 
         private void RaiseOnProgress(string progressMessage)
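
This constructor swap is the heart of the commit: Robots no longer owns a preconfigured HttpClient and instead accepts any Func<string, Task<string>>. A minimal sketch of how a caller might supply the delegate; the HttpClient setup and user-agent value below are illustrative assumptions, not part of this commit:

// Hypothetical caller wiring: any Func<string, Task<string>> will do.
using var httpClient = new HttpClient();
httpClient.DefaultRequestHeaders.TryAddWithoutValidation("User-Agent", "MyCrawler/1.0");
var robots = new Robots(url => httpClient.GetStringAsync(url));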
@@ -81,52 +58,58 @@ private async Task ParseRobots()
             _sitemaps ??= new HashSet<string>();
 
             string? line;
-            using (StringReader sr = new StringReader(_robotsContent))
+            using StringReader sr = new(_robotsContent);
+            Useragent currentAgent = new("*");
+            while ((line = await sr.ReadLineAsync()) != null)
             {
-                Useragent currentAgent = new Useragent("*");
-                while ((line = await sr.ReadLineAsync()) != null)
+                if (line.ToLower().StartsWith(Const.UserAgent.ToLower()))
                 {
-                    if (line.ToLower().StartsWith(Const.UserAgent.ToLower()))
-                    {
-                        string name = line.Substring(Const.UserAgentLength, line.Length - Const.UserAgentLength).Trim(' ');
-                        currentAgent = new Useragent(name);
-                        _userAgents.Add(currentAgent);
-                    }
-                    else if (line.ToLower().StartsWith(Const.Disallow))
-                        currentAgent.Disallowed.Add(line.Substring(Const.DisallowLength, line.Length - Const.DisallowLength).Trim(' '));
-                    else if (line.ToLower().StartsWith(Const.Allow))
-                        currentAgent.Allowed.Add(line.Substring(Const.AllowLength, line.Length - Const.AllowLength).Trim(' '));
-                    else if (line.ToLower().StartsWith(Const.Sitemap))
-                        _sitemaps.Add(line.Substring(Const.SitemapLength, line.Length - Const.SitemapLength).Trim(' '));
-                    else if (line.ToLower().StartsWith(Const.Crawldelay))
-                        currentAgent.Crawldelay = int.Parse(line.Substring(Const.CrawldelayLength, line.Length - Const.CrawldelayLength).Trim(' '));
-                    else if (line == string.Empty || line[0] == '#' || line == "<!DOCTYPE html> ")
-                        continue;
-                    else
-                        throw new Exception($"Unable to parse {line} in robots.txt");
+                    string name = line.Substring(Const.UserAgentLength, line.Length - Const.UserAgentLength).Trim(' ');
+                    currentAgent = new Useragent(name);
+                    _userAgents.Add(currentAgent);
                 }
+                else if (line.ToLower().StartsWith(Const.Disallow))
+                    currentAgent.Disallowed.Add(line.Substring(Const.DisallowLength, line.Length - Const.DisallowLength).Trim(' '));
+                else if (line.ToLower().StartsWith(Const.Allow))
+                    currentAgent.Allowed.Add(line.Substring(Const.AllowLength, line.Length - Const.AllowLength).Trim(' '));
+                else if (line.ToLower().StartsWith(Const.Sitemap))
+                    _sitemaps.Add(line.Substring(Const.SitemapLength, line.Length - Const.SitemapLength).Trim(' '));
+                else if (line.ToLower().StartsWith(Const.Crawldelay))
+                    currentAgent.Crawldelay = int.Parse(line.Substring(Const.CrawldelayLength, line.Length - Const.CrawldelayLength).Trim(' '));
+                else if (line == string.Empty || line[0] == '#' || line == "<!DOCTYPE html> ")
+                    continue;
+                else
+                    throw new Exception($"Unable to parse {line} in robots.txt");
             }
         }
 
+        private async Task<string> WebRequest(string url)
+        {
+            try
+            {
+                return await _downloadFunc(url);
+            }
+            catch (HttpRequestException ex)
+            {
+                RaiseOnProgress($"Web request returned failed status code: {ex.StatusCode}\r\n{ex.Message}");
+            }
+            catch (Exception ex)
+            {
+                RaiseOnProgress($"Error during web request:\r\n{ex.Message}");
+            }
+
+            return string.Empty;
+        }
+
         #region Interface Methods
 
         public async Task<bool> LoadRobotsFromUrl(string robotsUrl)
         {
             if (!Uri.TryCreate(robotsUrl, UriKind.Absolute, out Uri? robots))
                 throw new ArgumentException($"Unable to append robots.txt to {robotsUrl}");
 
-            try
-            {
-                var response = await _client.GetAsync(robots);
-                response.EnsureSuccessStatusCode();
-
-                _robotsContent = await response.Content.ReadAsStringAsync();
-            }
-            catch (HttpRequestException e)
-            {
-                Console.WriteLine(e.Message);
-                return false;
-            }
+            _robotsContent = await WebRequest(robots.ToString());
+            if (string.IsNullOrWhiteSpace(_robotsContent)) return false;
 
             await ParseRobots();
             return true;
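
A side benefit of the delegate, shown as a rough sketch: ParseRobots can be exercised without any network by serving an inline robots.txt that hits each directive branch above. The URL and file contents here are hypothetical:

// Offline test sketch: the delegate ignores the URL and returns fixed content.
var offline = new Robots(_ => Task.FromResult(
    """
    User-agent: *
    Disallow: /private/
    Allow: /public/
    Crawl-delay: 5
    Sitemap: https://example.com/sitemap.xml
    # comments and blank lines are skipped
    """));
bool loaded = await offline.LoadRobotsFromUrl("https://example.com/robots.txt");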
@@ -145,7 +128,7 @@ public IReadOnlyList<Useragent> UserAgents
             get
             {
                 if (_userAgents is null)
-                    throw new RobotsNotloadedException();
+                    throw new RobotsNotloadedException("Useragents is null");
                 return _userAgents;
             }
         }
@@ -156,7 +139,7 @@ public HashSet<string> Sitemaps
             get
             {
                 if (_sitemaps is null)
-                    throw new RobotsNotloadedException();
+                    throw new RobotsNotloadedException("Sitemaps is null");
                 return _sitemaps;
             }
         }
@@ -218,8 +201,8 @@ public async Task<IReadOnlyList<tUrl>> GetUrls(tSitemap tSitemap)
             if (tSitemap is null)
                 throw new ArgumentNullException(nameof(tSitemap), "sitemap requires a value");
 
-            var bytes = await _client.GetByteArrayAsync(tSitemap.loc);
-            if (TryDeserializeXMLStream(bytes, out urlset? urlSet) && urlSet?.url is not null)
+            var response = await WebRequest(tSitemap.loc);
+            if (TryDeserializeXMLStream(response, out urlset? urlSet) && urlSet?.url is not null)
                 return urlSet.url;
             else if (!_supressSitemapErrors)
                 throw new Exception($"Unable to deserialize content from {tSitemap.loc} to type urlset");
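
With GetUrls now routed through WebRequest, a failed download yields an empty string and, unless _supressSitemapErrors is set, the deserialization exception above. A rough usage sketch, assuming tSitemap exposes a settable loc as its XML-serialized role implies:

// Hypothetical call: fetch the <url> entries of one sitemap via the delegate.
var sitemap = new tSitemap { loc = "https://example.com/sitemap1.xml" };
IReadOnlyList<tUrl> urls = await robots.GetUrls(sitemap);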
@@ -231,8 +214,8 @@ public async Task<IReadOnlyList<tUrl>> GetUrls(tSitemap tSitemap)
 
         private async Task<IReadOnlyList<tSitemap>> GetSitemapsInternal(string sitemapUrl)
         {
-            var bytes = await _client.GetByteArrayAsync(sitemapUrl);
-            if (TryDeserializeXMLStream(bytes, out sitemapindex? sitemapIndex) && sitemapIndex?.sitemap is not null)
+            var response = await _downloadFunc(sitemapUrl);
+            if (TryDeserializeXMLStream(response, out sitemapindex? sitemapIndex) && sitemapIndex?.sitemap is not null)
                 return sitemapIndex.sitemap;
             else if (!_supressSitemapErrors)
                 throw new Exception($"Unable to deserialize content from {sitemapUrl} to type sitemapindex");
@@ -243,8 +226,8 @@ private async Task<IReadOnlyList<tSitemap>> GetSitemapsInternal(string sitemapUr
 
         private readonly List<tUrl> _sitemapLinks = new List<tUrl>(1000000);
         private async Task GetSitemapLinksInternal(string siteIndex)
         {
-            var bytes = await _client.GetByteArrayAsync(siteIndex);
-            if (TryDeserializeXMLStream(bytes, out sitemapindex? sitemapIndex) && sitemapIndex?.sitemap is not null)
+            var response = await _downloadFunc(siteIndex);
+            if (TryDeserializeXMLStream(response, out sitemapindex? sitemapIndex) && sitemapIndex?.sitemap is not null)
             {
                 foreach (tSitemap sitemap in sitemapIndex.sitemap)
                 {
@@ -253,24 +236,23 @@ private async Task GetSitemapLinksInternal(string siteIndex)
             }
             else
             {
-                if (TryDeserializeXMLStream(bytes, out urlset? urlSet) && urlSet?.url is not null)
+                if (TryDeserializeXMLStream(response, out urlset? urlSet) && urlSet?.url is not null)
                 {
                     _sitemapLinks.AddRange(urlSet.url.ToList());
                     RaiseOnProgress($"{_sitemapLinks.Count}");
                 }
             }
         }
 
-        private bool TryDeserializeXMLStream<T>(byte[] bytes, out T? xmlValue)
+        private bool TryDeserializeXMLStream<T>(string stringValue, out T? xmlValue)
         {
-            var stringVal = Encoding.UTF8.GetString(bytes);
-            stringVal = StripVersionFromString(stringVal);
+            stringValue = StripVersionFromString(stringValue);
 
-            using StringReader sr = new StringReader(stringVal);
+            using StringReader sr = new StringReader(stringValue);
             return TryDeserializeXMLStream(sr, out xmlValue);
         }
 
-        private bool TryDeserializeXMLStream<T>(TextReader reader, out T? xmlValue)
+        private static bool TryDeserializeXMLStream<T>(TextReader reader, out T? xmlValue)
         {
             try
             {
@@ -296,27 +278,5 @@ private string StripVersionFromString(string val)
                 return val.Remove(0, endChar + 2);
             return val;
         }
-
-        public void Dispose()
-        {
-            Dispose(true);
-            GC.SuppressFinalize(this);
-        }
-
-        ~Robots()
-        {
-            Dispose(false);
-        }
-
-        protected virtual void Dispose(bool disposing)
-        {
-            if (disposing)
-            {
-                if (_client != null)
-                {
-                    _client.Dispose();
-                }
-            }
-        }
     }
 }

Nick.RobotsParser/Useragent.cs

Lines changed: 2 additions & 2 deletions

@@ -18,7 +18,7 @@ public Useragent(string name)
         public HashSet<string> Allowed { get; set; }
         public HashSet<string> Disallowed { get; set; }
         public bool IsAllowed(string path) => Allowed.Any(x => path.Contains(x));
-        private bool allDissallowed => Disallowed.Contains("/");
-        public bool IsDisallowed(string path) => allDissallowed || Disallowed.Any(x => path.Contains(x));
+        private bool AllDissallowed => Disallowed.Contains("/");
+        public bool IsDisallowed(string path) => AllDissallowed || Disallowed.Any(x => path.Contains(x));
     }
 }
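
Note the rename only fixes the property's casing; matching remains substring-based (path.Contains(rule)), with "/" in Disallowed acting as a block-everything rule. A small illustration, assuming the constructor initializes both sets:

// Contains-based matching illustrated with hypothetical rules.
var agent = new Useragent("*");
agent.Disallowed.Add("/private/");
Console.WriteLine(agent.IsDisallowed("/private/page.html")); // True: substring match
agent.Disallowed.Add("/");
Console.WriteLine(agent.IsDisallowed("/anything"));          // True: "/" disallows every path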
