Commit 4f3de8e

Author: Nicholas Bergesen
Commit message: update robots
1 parent 2dca2c7 commit 4f3de8e

File tree: 2 files changed (+21, -47 lines)

Nick.RobotsParser/Nick.RobotsParser.csproj

Lines changed: 2 additions & 9 deletions

@@ -5,21 +5,14 @@
     <ImplicitUsings>enable</ImplicitUsings>
     <Nullable>enable</Nullable>
     <PackageId>Nick.RobotsParser</PackageId>
-    <Version>2.0.5</Version>
+    <Version>2.0.6</Version>
     <Authors>Nicholas Bergesen</Authors>
     <Company>Nicholas Bergesen</Company>
     <PackageReleaseNotes>
-      - Improve sitemap parsing robustness.
-      - Ignore sitemap parsing errors.
     </PackageReleaseNotes>
     <PackageTags>robots,parse robots,web crawling,robots.txt,web scraping,spider,sitemap,sitemap parse</PackageTags>
     <Description>
-      Library for working with robots.txt
-      - Parse robots.txt into Typed object.
-      - Lookup Allowed/Disallowed/Crawldelay based on User-Agent.
-      - Traverse sitemap in robots.txt for urls.
-
-      For More info see: https://github.com/nicholasbergesen/RobotsParser/blob/master/README.md
+      https://github.com/nicholasbergesen/RobotsParser/blob/master/README.md
     </Description>
   </PropertyGroup>
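
For consumers, the version bump is picked up through the usual package reference. A minimal sketch of the consuming project's entry, using only the PackageId and Version shown in the diff above:

  <ItemGroup>
    <PackageReference Include="Nick.RobotsParser" Version="2.0.6" />
  </ItemGroup>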

Nick.RobotsParser/RobotsParser.cs

Lines changed: 19 additions & 38 deletions

@@ -1,10 +1,6 @@
-using System.IO.Compression;
-using System.Net.Http.Headers;
-using System.Runtime.CompilerServices;
+using System.Net.Http.Headers;
 using System.Text;
-using System.Transactions;
 using System.Xml;
-using System.Xml.Linq;
 using System.Xml.Serialization;
 
 namespace RobotsParser
@@ -14,15 +10,14 @@ public interface IRobots
         Task<bool> LoadRobotsFromUrl(string robotsUrl);
         Task<bool> LoadRobotsContent(string robotsContent);
         IReadOnlyList<Useragent> UserAgents { get; }
-        IEnumerable<string> Sitemaps { get; }
+        HashSet<string> Sitemaps { get; }
         IEnumerable<string> GetAllowedPaths(string userAgent = "*");
         IEnumerable<string> GetDisallowedPaths(string userAgent = "*");
         bool IsPathAllowed(string path, string userAgent = "*");
         bool IsPathDisallowed(string path, string userAgent = "*");
         int GetCrawlDelay(string userAgent = "*");
-        Task<IReadOnlyList<tSitemap>> GetSitemapIndexes(string sitemapUrl = null);
+        Task<IReadOnlyList<tSitemap>> GetSitemapIndexes(string? sitemapUrl = null);
         Task<IReadOnlyList<tUrl>> GetUrls(tSitemap tSitemap);
-        string? SitemapUrl { get; set; }
     }
 
     public class ProgressEventArgs : EventArgs
@@ -36,16 +31,13 @@ public ProgressEventArgs(string progressMessage)
 
     public class Robots : IRobots, IDisposable
     {
-        private readonly Uri _robotsUri;
-        private string? _robots;
+        private string? _robotsContent;
         private readonly HttpClient _client;
         private readonly bool _supressSitemapErrors;
 
         public event ProgressEventHandler? OnProgress;
         public delegate void ProgressEventHandler(object sender, ProgressEventArgs e);
 
-        public string? SitemapUrl { get; set; }
-
        public Robots(string userAgent, bool supressSitemapErrors = false)
        {
            _supressSitemapErrors = supressSitemapErrors;
@@ -82,14 +74,14 @@ private void RaiseOnProgress(string progressMessage)
 
         private async Task ParseRobots()
         {
-            if(_robots is null)
+            if(_robotsContent is null)
                 throw new RobotsNotloadedException();
 
             _userAgents = new List<Useragent>();
-            _sitemaps = new HashSet<string>();
+            _sitemaps ??= new HashSet<string>();
 
             string? line;
-            using (StringReader sr = new StringReader(_robots))
+            using (StringReader sr = new StringReader(_robotsContent))
             {
                 Useragent currentAgent = new Useragent("*");
                 while ((line = await sr.ReadLineAsync()) != null)
@@ -114,8 +106,6 @@ private async Task ParseRobots()
                     throw new Exception($"Unable to parse {line} in robots.txt");
                 }
             }
-
-            SitemapUrl = _sitemaps.FirstOrDefault();
         }
 
         #region Interface Methods
@@ -130,7 +120,7 @@ public async Task<bool> LoadRobotsFromUrl(string robotsUrl)
                 var response = await _client.GetAsync(robots);
                 response.EnsureSuccessStatusCode();
 
-                _robots = await response.Content.ReadAsStringAsync();
+                _robotsContent = await response.Content.ReadAsStringAsync();
             }
             catch (HttpRequestException e)
             {
@@ -144,7 +134,7 @@ public async Task<bool> LoadRobotsFromUrl(string robotsUrl)
 
         public async Task<bool> LoadRobotsContent(string robotsContent)
         {
-            _robots = robotsContent;
+            _robotsContent = robotsContent;
             await ParseRobots();
             return true;
         }
@@ -161,7 +151,7 @@ public IReadOnlyList<Useragent> UserAgents
         }
 
         private HashSet<string>? _sitemaps;
-        public IEnumerable<string> Sitemaps
+        public HashSet<string> Sitemaps
         {
             get
             {
@@ -199,37 +189,28 @@ public int GetCrawlDelay(string userAgent = "*") {
 
         public async Task<IReadOnlyList<tSitemap>> GetSitemapIndexes(string? sitemapUrl = null)
         {
-            SitemapUrl ??= sitemapUrl;
-
-            if (!string.IsNullOrEmpty(SitemapUrl))
+            if (!string.IsNullOrEmpty(sitemapUrl))
             {
                 _sitemaps ??= new HashSet<string>();
-                _sitemaps.Add(SitemapUrl);
+                _sitemaps.Add(sitemapUrl);
             }
 
             if (_sitemaps is null)
             {
-                if (_robots is null)
+                if (_robotsContent is null)
                     throw new RobotsNotloadedException("Please call LoadRobotsFromUrl, LoadRobotsContent or pass a sitemap url to GetSitemapIndexes.");
 
                 return new List<tSitemap>();
             }
 
-            //If not value given from user then go through _sitemaps.
-            if (string.IsNullOrEmpty(SitemapUrl))
-            {
-                List<tSitemap> sitemaps = new List<tSitemap>(100000);
-                if(_sitemaps.Count > 0)
-                {
-                    foreach (var sitemap in _sitemaps)
-                        sitemaps.AddRange(await GetSitemapsInternal(sitemap));
-                }
-                return sitemaps;
-            }
-            else
+            List<tSitemap> sitemaps = new List<tSitemap>(100000);
+            if(_sitemaps.Any())
             {
-                return await GetSitemapsInternal(SitemapUrl);
+                foreach (var sitemap in _sitemaps)
+                    sitemaps.AddRange(await GetSitemapsInternal(sitemap));
             }
+
+            return sitemaps;
         }
 
         public async Task<IReadOnlyList<tUrl>> GetUrls(tSitemap tSitemap)
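
Taken together, these changes drop the stateful SitemapUrl property: the override now travels as the optional sitemapUrl argument, and discovered sitemaps are exposed as a deduplicating HashSet<string>. A minimal consumer sketch against the interface as it stands after this commit; the crawler name and example.com URLs are placeholder assumptions, and only members visible in this diff are used:

using System;
using System.Collections.Generic;
using RobotsParser;

// Minimal sketch of the post-commit API. "MyCrawler/1.0" and the
// example.com URLs are placeholders, not values from the repository.
using var robots = new Robots(userAgent: "MyCrawler/1.0", supressSitemapErrors: true);

// Fetches robots.txt over HTTP and parses it (stored in _robotsContent internally).
await robots.LoadRobotsFromUrl("https://example.com/robots.txt");

if (robots.IsPathAllowed("/products", "MyCrawler/1.0"))
{
    // Sitemaps is now a HashSet<string>, so repeated Sitemap: lines are deduplicated.
    foreach (string sitemap in robots.Sitemaps)
        Console.WriteLine(sitemap);

    // The optional argument replaces the removed SitemapUrl property:
    // passing a URL here seeds the sitemap set for this call.
    IReadOnlyList<tSitemap> indexes =
        await robots.GetSitemapIndexes("https://example.com/sitemap.xml");

    foreach (tSitemap index in indexes)
    {
        IReadOnlyList<tUrl> urls = await robots.GetUrls(index);
        Console.WriteLine($"fetched {urls.Count} urls");
    }
}

Note that, per the updated guard in GetSitemapIndexes, passing a sitemap url directly also works without calling LoadRobotsFromUrl or LoadRobotsContent first; the RobotsNotloadedException is only thrown when neither robots content nor a sitemap url has been supplied.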
