-
Notifications
You must be signed in to change notification settings - Fork 0
Add support for partial results tracking on a scan #68
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
15083ff
c5ee54f
ca7fa87
fccfe08
1f63b37
60b44eb
2fc13dd
4f437a8
1a71f8d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,10 +9,11 @@ | |
| package de.rub.nds.crawler.core; | ||
|
|
||
| import de.rub.nds.crawler.data.ScanConfig; | ||
| import de.rub.nds.crawler.data.ScanJobDescription; | ||
| import de.rub.nds.crawler.data.ScanTarget; | ||
| import de.rub.nds.crawler.persistence.IPersistenceProvider; | ||
| import de.rub.nds.crawler.util.CanceallableThreadPoolExecutor; | ||
| import de.rub.nds.scanner.core.execution.NamedThreadFactory; | ||
| import java.util.concurrent.Future; | ||
| import java.util.concurrent.LinkedBlockingDeque; | ||
| import java.util.concurrent.ThreadPoolExecutor; | ||
| import java.util.concurrent.TimeUnit; | ||
|
|
@@ -41,6 +42,9 @@ public abstract class BulkScanWorker<T extends ScanConfig> { | |
| /** The scan configuration for this worker */ | ||
| protected final T scanConfig; | ||
|
|
||
| /** The persistence provider for writing partial results */ | ||
| private IPersistenceProvider persistenceProvider; | ||
|
|
||
| /** | ||
| * Calls the inner scan function and may handle cleanup. This is needed to wrap the scanner into | ||
| * a future object such that we can handle timeouts properly. | ||
|
|
@@ -74,31 +78,53 @@ protected BulkScanWorker(String bulkScanId, T scanConfig, int parallelScanThread | |
| * Handles a scan target by submitting it to the executor. If init was not called, it will | ||
| * initialize itself. In this case it will also clean up itself if all jobs are done. | ||
| * | ||
| * @param scanTarget The target to scan. | ||
| * @return A future that resolves to the scan result once the scan is done. | ||
| * <p>Returns a {@link ScheduledScan} that represents the entire scan lifecycle, allowing | ||
| * callers to: | ||
| * | ||
| * <ul> | ||
| * <li>Get partial results as the scan progresses | ||
| * <li>Register listeners for progress updates | ||
| * <li>Wait for the final result | ||
| * </ul> | ||
| * | ||
| * @param jobDescription The job description for this scan. | ||
| * @return A ScheduledScan representing the scan lifecycle | ||
| */ | ||
| public Future<Document> handle(ScanTarget scanTarget) { | ||
| public ScheduledScan handle(ScanJobDescription jobDescription) { | ||
| // if we initialized ourself, we also clean up ourself | ||
| shouldCleanupSelf.weakCompareAndSetAcquire(false, init()); | ||
| activeJobs.incrementAndGet(); | ||
| return timeoutExecutor.submit( | ||
|
|
||
| ScheduledScan scheduledScan = new ScheduledScan(); | ||
|
|
||
| timeoutExecutor.submit( | ||
| () -> { | ||
| Document result = scan(scanTarget); | ||
| if (activeJobs.decrementAndGet() == 0 && shouldCleanupSelf.get()) { | ||
| cleanup(); | ||
| try { | ||
| Document result = scan(jobDescription, scheduledScan); | ||
| scheduledScan.complete(result); | ||
| if (activeJobs.decrementAndGet() == 0 && shouldCleanupSelf.get()) { | ||
| cleanup(); | ||
| } | ||
| } catch (Exception e) { | ||
| scheduledScan.completeExceptionally(e); | ||
| activeJobs.decrementAndGet(); | ||
| throw e; | ||
| } | ||
| return result; | ||
| }); | ||
|
|
||
| return scheduledScan; | ||
| } | ||
|
|
||
| /** | ||
| * Scans a target and returns the result as a Document. This is the core scanning functionality | ||
| * that must be implemented by subclasses. | ||
| * | ||
| * @param scanTarget The target to scan | ||
| * @param jobDescription The job description containing target and metadata | ||
| * @param scheduledScan The scheduled scan for reporting progress via {@link | ||
| * ScheduledScan#updateResult} | ||
| * @return The scan result as a Document | ||
| */ | ||
| public abstract Document scan(ScanTarget scanTarget); | ||
| public abstract Document scan(ScanJobDescription jobDescription, ScheduledScan scheduledScan); | ||
|
|
||
| /** | ||
| * Initializes this worker if it hasn't been initialized yet. This method is thread-safe and | ||
|
|
@@ -161,4 +187,26 @@ public final boolean cleanup() { | |
| * specific resources. | ||
| */ | ||
| protected abstract void cleanupInternal(); | ||
|
|
||
| /** | ||
| * Sets the persistence provider for writing partial results. | ||
| * | ||
| * @param persistenceProvider The persistence provider to use | ||
| */ | ||
| public void setPersistenceProvider(IPersistenceProvider persistenceProvider) { | ||
| this.persistenceProvider = persistenceProvider; | ||
| } | ||
|
Comment on lines
+196
to
+198
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we instead pass this in the constructor? Is there a use case for having this as a variable instead of a constant? |
||
|
|
||
| /** | ||
| * Persists a partial scan result. This method can be called by subclasses during scanning to | ||
| * save intermediate results. | ||
| * | ||
| * @param jobDescription The job description for the scan | ||
| * @param partialResult The partial result document to persist | ||
| */ | ||
| protected void persistPartialResult(ScanJobDescription jobDescription, Document partialResult) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is this not somehow linked to
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see two approaches:
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Or are there cases where one function is called and not the other? |
||
| if (persistenceProvider != null) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we need this check? |
||
| persistenceProvider.upsertPartialResult(jobDescription, partialResult); | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,88 @@ | ||
| /* | ||
| * TLS-Crawler - A TLS scanning tool to perform large scale scans with the TLS-Scanner | ||
| * | ||
| * Copyright 2018-2023 Ruhr University Bochum, Paderborn University, and Hackmanit GmbH | ||
| * | ||
| * Licensed under Apache License, Version 2.0 | ||
| * http://www.apache.org/licenses/LICENSE-2.0.txt | ||
| */ | ||
| package de.rub.nds.crawler.core; | ||
|
|
||
| import java.util.concurrent.CompletableFuture; | ||
| import java.util.concurrent.Future; | ||
| import org.bson.Document; | ||
|
|
||
| /** | ||
| * Represents a scheduled scan that tracks progress and provides both partial and final results. | ||
| * | ||
| * <p>This class provides a clean abstraction for the scan lifecycle: | ||
| * | ||
| * <ul> | ||
| * <li>Check if the scan is complete via {@link #isComplete()} | ||
| * <li>Get the current result (partial or final) via {@link #getCurrentResult()} | ||
| * <li>Wait for the final result via {@link #getFinalResult()} | ||
| * </ul> | ||
| */ | ||
| public class ScheduledScan { | ||
|
|
||
| private volatile Document currentResult; | ||
| private final CompletableFuture<Document> finalResult = new CompletableFuture<>(); | ||
|
Comment on lines
+26
to
+29
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What is the reasoning for wrapping the future instead of extending it?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I also find that the name does not capture the functionality. |
||
|
|
||
| /** | ||
| * Check if the scan has completed. | ||
| * | ||
| * @return true if the scan is complete, false if still in progress | ||
| */ | ||
| public boolean isComplete() { | ||
| return finalResult.isDone(); | ||
| } | ||
|
|
||
| /** | ||
| * Get the current result document. If the scan is still in progress, this returns the latest | ||
| * partial result. If the scan is complete, this returns the final result. | ||
| * | ||
| * @return The current result document, or null if no result is available yet | ||
| */ | ||
| public Document getCurrentResult() { | ||
| return currentResult; | ||
| } | ||
|
|
||
| /** | ||
| * Get a Future that will resolve to the final result when the scan completes. | ||
| * | ||
| * @return A Future containing the final scan result | ||
| */ | ||
| public Future<Document> getFinalResult() { | ||
| return finalResult; | ||
| } | ||
|
|
||
| /** | ||
| * Update the current result. This is called by the scan worker when new partial results are | ||
| * available. | ||
| * | ||
| * @param partialResult The updated partial result document | ||
| */ | ||
| public void updateResult(Document partialResult) { | ||
| this.currentResult = partialResult; | ||
| } | ||
|
|
||
| /** | ||
| * Mark the scan as complete with the final result. This will complete the Future and notify any | ||
| * waiting consumers. | ||
| * | ||
| * @param result The final scan result | ||
| */ | ||
| void complete(Document result) { | ||
| this.currentResult = result; | ||
| this.finalResult.complete(result); | ||
| } | ||
|
|
||
| /** | ||
| * Mark the scan as failed with an exception. | ||
| * | ||
| * @param exception The exception that caused the failure | ||
| */ | ||
| void completeExceptionally(Throwable exception) { | ||
| this.finalResult.completeExceptionally(exception); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I dislike having an "out parameter". The only alternative I see would be a functional interface to feed the partial results into. But I guess that isn't that much of a difference.