Skip to content

Commit 59357ac

Browse files
author
David Robinson
committed
Add the possibility to transform the URL in "OpenLink".
1 parent eb5d18e commit 59357ac

5 files changed

Lines changed: 37 additions & 13 deletions

File tree

CocoCrawler/Builders/PageCrawlJobBuilder.cs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
using CocoCrawler.CrawlOutputs;
1+
using AngleSharp.Dom;
2+
using CocoCrawler.CrawlOutputs;
23
using CocoCrawler.Exceptions;
34
using CocoCrawler.Job;
45
using CocoCrawler.Job.PageBrowserActions;
@@ -60,14 +61,17 @@ public PageCrawlJobBuilder ConfigurePageActions(Action<PageActionsBuilder> optio
6061
return this;
6162
}
6263

64+
65+
6366
/// <summary>
6467
/// Adds a task that opens the links matched by a selector and runs the configured crawl job for them.
6568
/// </summary>
6669
/// <param name="linksSelector">The CSS selector that matches the link elements to open.</param>
6770
/// <param name="jobOptions">The action that configures the <see cref="PageCrawlJobBuilder"/> used for the opened links.</param>
6871
/// <param name="options">The action to configure the page actions for the openLinks tasks.</param>
72+
/// <param name="linksSelectorFunc">A function to execute for each matching element, that produces the URL to follow.</param>
6973
/// <returns>The updated <see cref="PageCrawlJobBuilder"/> instance.</returns>
70-
public PageCrawlJobBuilder OpenLinks(string linksSelector, Action<PageCrawlJobBuilder> jobOptions, Action<PageActionsBuilder>? options = null)
74+
public PageCrawlJobBuilder OpenLinks(string linksSelector, Action<PageCrawlJobBuilder> jobOptions, Action<PageActionsBuilder>? options = null, Func<IElement, string?>? linksSelectorFunc = null)
7175
{
7276
PageActionsBuilder? pageActionsBuilder = null;
7377

@@ -82,7 +86,7 @@ public PageCrawlJobBuilder OpenLinks(string linksSelector, Action<PageCrawlJobBu
8286

8387
jobOptions(builder);
8488

85-
Tasks.Add(new CrawlPageOpenLinksTask(linksSelector, builder, pageActionsBuilder?.Build()));
89+
Tasks.Add(new CrawlPageOpenLinksTask(linksSelector, builder, pageActionsBuilder?.Build(), linksSelectorFunc));
8690

8791
return this;
8892
}
Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,29 @@
1-
using CocoCrawler.Builders;
1+
using AngleSharp.Dom;
2+
using CocoCrawler.Builders;
23
using CocoCrawler.Job.PageBrowserActions;
34

45
namespace CocoCrawler.Job.PageTasks;
56

6-
public class CrawlPageOpenLinksTask(string paginationSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null) : IPageCrawlTask
7+
public class CrawlPageOpenLinksTask : IPageCrawlTask
78
{
8-
public string OpenLinksSelector { get; init; } = paginationSelector;
9-
public PageActions? PageActions { get; init; } = pageActions;
10-
public PageCrawlJobBuilder JobBuilder { get; init; } = builder;
9+
public string OpenLinksSelector { get; init; }
10+
public PageActions? PageActions { get; init; }
11+
public Func<IElement, string?>? LinkProcessor { get; }
12+
public PageCrawlJobBuilder JobBuilder { get; init; }
13+
14+
public CrawlPageOpenLinksTask(string linksSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null)
15+
{
16+
OpenLinksSelector = linksSelector;
17+
PageActions = pageActions;
18+
JobBuilder = builder;
19+
}
20+
21+
public CrawlPageOpenLinksTask(string linksSelector, PageCrawlJobBuilder builder, PageActions? pageActions = null, Func<IElement, string?>? linkProcessor = null)
22+
{
23+
OpenLinksSelector = linksSelector;
24+
PageActions = pageActions;
25+
LinkProcessor = linkProcessor;
26+
JobBuilder = builder;
27+
}
1128
}
1229

CocoCrawler/Crawler/PuppeteerCrawler.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ protected virtual void HandlePaginateTask(CrawlPagePaginateTask paginate, PageCr
115115

116116
protected virtual void HandleOpenLinksTask(CrawlPageOpenLinksTask openLinks, PageCrawlJob job, List<PageCrawlJob> newJobs)
117117
{
118-
var urls = Parser!.ParseForLinks(openLinks.OpenLinksSelector);
118+
var urls = Parser!.ParseForLinks(openLinks.OpenLinksSelector, openLinks.LinkProcessor);
119119

120120
Logger?.LogDebug("OpenLinks selector returned {Count} Urls found in openLinks task.", urls.Length);
121121

CocoCrawler/Parser/AngleSharpParser.cs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@ public virtual async Task Init(string html)
1717
_document = await context.OpenAsync(req => req.Content(html));
1818
}
1919

20-
public virtual string[] ParseForLinks(string linksSelector)
20+
public virtual string[] ParseForLinks(string linksSelector, Func<IElement, string?>? linkProcessor = null)
2121
{
22+
linkProcessor ??= (element) => element.GetAttribute("href");
23+
2224
return _document!.QuerySelectorAll(linksSelector)
23-
.Select(link => link.GetAttribute("href"))
25+
.Select(link => linkProcessor(link))
2426
.Where(link => link is not null)
2527
.Select(link => link!)
2628
.ToArray();

CocoCrawler/Parser/IParser.cs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
1-
using CocoCrawler.Job.PageTasks;
1+
using AngleSharp.Dom;
2+
using CocoCrawler.Job.PageTasks;
23
using Newtonsoft.Json.Linq;
34

45
namespace CocoCrawler.Parser;
56

67
public interface IParser
78
{
89
Task Init(string html);
9-
string[] ParseForLinks(string linksSelector);
10+
string[] ParseForLinks(string linksSelector, Func<IElement, string?>? linkProcessor = null);
1011
JArray ExtractList(CrawlPageExtractListTask scrapeList);
1112
JObject ExtractObject(CrawlPageExtractObjectTask task);
1213
}

0 commit comments

Comments
 (0)