Skip to content

Commit

Permalink
Add web crawling (#6)
Browse files Browse the repository at this point in the history
* Add CrawlView
* Adjust timeout config
  • Loading branch information
pacphi authored Oct 24, 2024
1 parent d2658f4 commit 9fbe30d
Show file tree
Hide file tree
Showing 6 changed files with 172 additions and 1 deletion.
6 changes: 6 additions & 0 deletions src/main/java/org/cftoolsuite/client/SanfordClient.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import java.util.List;

import org.cftoolsuite.domain.FileMetadata;
import org.cftoolsuite.domain.crawl.CrawlRequest;
import org.cftoolsuite.domain.crawl.CrawlResponse;
import org.springframework.cloud.openfeign.FeignClient;
import org.springframework.core.io.Resource;
import org.springframework.http.MediaType;
Expand All @@ -11,6 +13,7 @@
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.multipart.MultipartFile;
Expand All @@ -21,6 +24,9 @@ public interface SanfordClient {
/**
 * Uploads a single document to the sanford backend as multipart/form-data.
 * The multipart part name is "fileName".
 *
 * @param file the document to upload
 * @return metadata describing the stored file
 */
@PostMapping(value = "/api/files/upload", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)
ResponseEntity<FileMetadata> uploadFile(@RequestPart("fileName") MultipartFile file);

/**
 * Kicks off a crawl of an external website on the sanford backend.
 * NOTE(review): this endpoint is mapped to "/crawl", unlike the sibling
 * endpoints which use an "/api/files/..." prefix — confirm this matches
 * the server-side controller mapping.
 *
 * @param crawlRequest the crawl parameters (root domain, seeds, filters)
 * @return the crawl job outcome as reported by the server
 */
@PostMapping("/crawl")
public ResponseEntity<CrawlResponse> startCrawl(@RequestBody CrawlRequest crawlRequest);

/**
 * Sends a chat message about the stored documents; the message is passed
 * as the "q" query parameter.
 *
 * @param message the user's question
 * @return the chat reply body
 */
@GetMapping("/api/files/chat")
public ResponseEntity<String> chat(@RequestParam("q") String message);

Expand Down
33 changes: 33 additions & 0 deletions src/main/java/org/cftoolsuite/domain/crawl/CrawlRequest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package org.cftoolsuite.domain.crawl;

import org.apache.commons.lang3.StringUtils;
import org.springframework.util.Assert;

/**
 * Request payload for initiating a website crawl.
 *
 * <p>The compact constructor validates required fields and normalizes the
 * optional ones to sensible defaults:
 * <ul>
 *   <li>{@code rootDomain} — required, must contain non-whitespace text</li>
 *   <li>{@code seeds} — required, at least one seed URL</li>
 *   <li>{@code storageFolder} — resolved under {tmpdir}/crawler4j; a blank
 *       value means the parent folder itself is used</li>
 *   <li>{@code includesRegexFilter} — defaults to matching .htm/.html files</li>
 *   <li>{@code maxDepthOfCrawling} — non-positive or null becomes -1
 *       (presumably "unlimited" for the crawler — confirm against crawler4j docs)</li>
 *   <li>{@code numberOfCrawlers} — non-positive or null becomes 3</li>
 * </ul>
 *
 * @throws IllegalArgumentException if rootDomain is blank or seeds is empty
 */
public record CrawlRequest(
String rootDomain,
String[] seeds,
String storageFolder,
Integer maxDepthOfCrawling,
String includesRegexFilter,
Integer numberOfCrawlers
) {
    public CrawlRequest {
        // Plain-JDK validation; throws IllegalArgumentException exactly like the
        // previous Spring Assert / commons-lang3 calls, without the framework dependency.
        if (rootDomain == null || rootDomain.isBlank()) {
            throw new IllegalArgumentException("A root domain must be specified!");
        }
        if (seeds == null || seeds.length < 1) {
            throw new IllegalArgumentException("At least one seed URL must be specified!");
        }
        // Defensive copy: records expose their components, and arrays are mutable.
        seeds = seeds.clone();
        // All crawl output is rooted under {java.io.tmpdir}/crawler4j.
        String parentForStorageFolder = String.join(System.getProperty("file.separator"), System.getProperty("java.io.tmpdir"), "crawler4j");
        if (storageFolder == null || storageFolder.isBlank()) {
            storageFolder = parentForStorageFolder;
        } else {
            storageFolder = String.join(System.getProperty("file.separator"), parentForStorageFolder, storageFolder);
        }
        if (includesRegexFilter == null || includesRegexFilter.isBlank()) {
            // Default: only crawl .htm/.html resources.
            includesRegexFilter = ".*(\\.(htm|html))$";
        }
        if (maxDepthOfCrawling == null || maxDepthOfCrawling <= 0) {
            maxDepthOfCrawling = -1;
        }
        if (numberOfCrawlers == null || numberOfCrawlers <= 0) {
            numberOfCrawlers = 3;
        }
    }
}
3 changes: 3 additions & 0 deletions src/main/java/org/cftoolsuite/domain/crawl/CrawlResponse.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
package org.cftoolsuite.domain.crawl;

public record CrawlResponse(String id, String storageFolder, String result) {}
4 changes: 3 additions & 1 deletion src/main/java/org/cftoolsuite/ui/MainLayout.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.cftoolsuite.ui;

import org.cftoolsuite.ui.view.ChatView;
import org.cftoolsuite.ui.view.CrawlView;
import org.cftoolsuite.ui.view.DeleteView;
import org.cftoolsuite.ui.view.DownloadView;
import org.cftoolsuite.ui.view.HomeView;
Expand Down Expand Up @@ -36,13 +37,14 @@ public MainLayout() {
Tabs actionTabs = createTabs();

Tab uploadTab = createTab(VaadinIcon.UPLOAD.create(), "Upload documents", UploadView.class);
Tab crawlTab = createTab(VaadinIcon.SITEMAP.create(), "Crawl websites for documents", CrawlView.class);
Tab chatTab = createTab(VaadinIcon.CHAT.create(), "Chat with AI bot about documents", ChatView.class);
Tab listTab = createTab(VaadinIcon.LIST.create(), "List document metadata", ListView.class);
Tab searchTab = createTab(VaadinIcon.SEARCH.create(), "Search for document metadata", SearchView.class);
Tab summaryTab = createTab(VaadinIcon.BULLETS.create(), "Summarize a document", SummarizeView.class);
Tab downloadTab = createTab(VaadinIcon.DOWNLOAD.create(), "Download a document", DownloadView.class);
Tab deleteTab = createTab(VaadinIcon.TRASH.create(), "Delete a document", DeleteView.class);
actionTabs.add(uploadTab, chatTab, listTab, searchTab, summaryTab, downloadTab, deleteTab);
actionTabs.add(uploadTab, crawlTab, chatTab, listTab, searchTab, summaryTab, downloadTab, deleteTab);
accordion.add("Actions", actionTabs).addThemeVariants(DetailsVariant.REVERSE);

addToNavbar(true, homeTab, new DrawerToggle());
Expand Down
118 changes: 118 additions & 0 deletions src/main/java/org/cftoolsuite/ui/view/CrawlView.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
package org.cftoolsuite.ui.view;

import java.util.Arrays;
import java.util.stream.Collectors;

import org.cftoolsuite.client.SanfordClient;
import org.cftoolsuite.domain.crawl.CrawlRequest;
import org.cftoolsuite.domain.crawl.CrawlResponse;
import org.cftoolsuite.ui.MainLayout;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.ResponseEntity;

import com.vaadin.flow.component.button.Button;
import com.vaadin.flow.component.html.H2;
import com.vaadin.flow.component.notification.NotificationVariant;
import com.vaadin.flow.component.orderedlayout.HorizontalLayout;
import com.vaadin.flow.component.textfield.TextArea;
import com.vaadin.flow.component.textfield.TextField;
import com.vaadin.flow.router.PageTitle;
import com.vaadin.flow.router.Route;

import jakarta.annotation.PostConstruct;

/**
 * Vaadin view for submitting website-crawl requests to the sanford backend.
 * Collects a root domain, a comma-separated list of seed sub-paths, and an
 * optional regex filter, then posts a {@link CrawlRequest} via the Feign client.
 */
@PageTitle("sanford-ui » Crawl")
@Route(value = "crawl", layout = MainLayout.class)
public class CrawlView extends BaseView {

    private static final Logger log = LoggerFactory.getLogger(CrawlView.class);

    private TextField rootDomain;
    private TextArea seeds;
    private TextField includesRegexFilter;
    private Button crawlButton;
    private Button clearButton;
    private HorizontalLayout buttons;

    public CrawlView(SanfordClient sanfordClient) {
        super(sanfordClient);
    }

    /** Lays out the view after dependency injection completes. */
    @PostConstruct
    public void init() {
        setAlignItems(Alignment.CENTER);
        setJustifyContentMode(JustifyContentMode.CENTER);
        add(getLogoImage());
        setupUI();
    }

    /** Builds the form fields, buttons, and click handlers. */
    @Override
    protected void setupUI() {
        this.rootDomain = new TextField("Root domain");
        this.rootDomain.setRequired(true);
        this.rootDomain.setHelperText("The root domain of the website you want to crawl which may also include sub-paths.");
        this.seeds = new TextArea("Seeds");
        this.seeds.setRequired(true);
        this.seeds.setHelperText("A comma-separated list of seeds from which to execute crawling from. Each seed should be an additional sub-path from the root domain. Links found within each file found will be crawled so long as they match filter. The crawling algorithm is also constrained to a maximum depth of 5.");
        this.includesRegexFilter = new TextField("Regex-based includes filter");
        this.includesRegexFilter.setHelperText("A regex-based filter that will impact what files are crawled based upon file extensions. If left blank, the default will be .*(\\\\.(htm|html))$ .");
        this.crawlButton = new Button("Crawl");
        this.clearButton = new Button("Clear");
        this.buttons = new HorizontalLayout();

        buttons.add(crawlButton, clearButton);

        buttons.setAlignItems(Alignment.CENTER);
        buttons.setJustifyContentMode(JustifyContentMode.CENTER);
        crawlButton.addClickListener(event -> crawlRequest());
        clearButton.addClickListener(event -> clearAllFields());

        add(
            new H2("Crawl a website"),
            rootDomain,
            seeds,
            includesRegexFilter,
            buttons
        );

        autoSizeFields();
    }

    /**
     * Submits the crawl request built from the form fields and reports the
     * outcome as a notification. Server-side errors and client exceptions are
     * both surfaced to the user rather than swallowed.
     */
    protected void crawlRequest() {
        try {
            // storageFolder, maxDepthOfCrawling, and numberOfCrawlers are left null
            // so the CrawlRequest record applies its defaults.
            CrawlRequest request =
                new CrawlRequest(rootDomain.getValue(), convertToArray(seeds.getValue()), null, null, includesRegexFilter.getValue(), null);
            ResponseEntity<CrawlResponse> response = sanfordClient.startCrawl(request);
            if (response.getStatusCode().is2xxSuccessful() && response.getBody() != null) {
                showNotification("Completed crawling website", NotificationVariant.LUMO_SUCCESS);
            } else {
                showNotification("Error crawling website", NotificationVariant.LUMO_ERROR);
            }
        } catch (Exception e) {
            log.error("Error crawling website", e);
            showNotification("Error crawling website: " + e.getMessage(), NotificationVariant.LUMO_ERROR);
        }
    }

    /**
     * Splits a comma-separated string into trimmed, non-empty, de-duplicated
     * entries. Uses {@code distinct()} so duplicates are dropped while the
     * user's input order is preserved (the previous Collectors.toSet()
     * produced a nondeterministic order).
     *
     * @param commaSeparatedString raw field value, e.g. "a, b,,a"
     * @return unique entries in first-occurrence order
     */
    protected String[] convertToArray(String commaSeparatedString) {
        return Arrays.stream(commaSeparatedString.split(","))
            .map(String::trim)
            .filter(s -> !s.isEmpty())
            .distinct()
            .toArray(String[]::new);
    }

    /** Resets every form field to its empty state. */
    @Override
    protected void clearAllFields() {
        rootDomain.clear();
        seeds.clear();
        includesRegexFilter.clear();
    }

    /** Fixed widths keep the centered form visually aligned. */
    private void autoSizeFields() {
        rootDomain.setWidth("480px");
        seeds.setWidth("480px");
        includesRegexFilter.setWidth("240px");
    }
}
9 changes: 9 additions & 0 deletions src/main/resources/application.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@ spring:
application:
name: sanford-ui

mvc:
async:
request-timeout: ${SPRING_MVC_ASYNC_REQUEST_TIMEOUT:-1}

servlet:
multipart:
max-file-size: 100MB
max-request-size: 100MB

threads:
virtual:
enabled: true
Expand Down

0 comments on commit 9fbe30d

Please sign in to comment.