diff --git a/src/main/java/org/cftoolsuite/client/SanfordClient.java b/src/main/java/org/cftoolsuite/client/SanfordClient.java
index d3865ca..10086b1 100644
--- a/src/main/java/org/cftoolsuite/client/SanfordClient.java
+++ b/src/main/java/org/cftoolsuite/client/SanfordClient.java
@@ -3,6 +3,8 @@
 import java.util.List;
 
 import org.cftoolsuite.domain.FileMetadata;
+import org.cftoolsuite.domain.crawl.CrawlRequest;
+import org.cftoolsuite.domain.crawl.CrawlResponse;
 import org.springframework.cloud.openfeign.FeignClient;
 import org.springframework.core.io.Resource;
 import org.springframework.http.MediaType;
@@ -11,6 +13,7 @@
 import org.springframework.web.bind.annotation.GetMapping;
 import org.springframework.web.bind.annotation.PathVariable;
 import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.RequestBody;
 import org.springframework.web.bind.annotation.RequestParam;
 import org.springframework.web.bind.annotation.RequestPart;
 import org.springframework.web.multipart.MultipartFile;
@@ -21,6 +24,9 @@
     @PostMapping(value = "/api/files/upload", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)
     ResponseEntity uploadFile(@RequestPart("fileName") MultipartFile file);
 
+    @PostMapping("/crawl")
+    ResponseEntity<CrawlResponse> startCrawl(@RequestBody CrawlRequest crawlRequest);
+
     @GetMapping("/api/files/chat")
     public ResponseEntity chat(@RequestParam("q") String message);
 
diff --git a/src/main/java/org/cftoolsuite/domain/crawl/CrawlRequest.java b/src/main/java/org/cftoolsuite/domain/crawl/CrawlRequest.java
new file mode 100644
index 0000000..ad914d9
--- /dev/null
+++ b/src/main/java/org/cftoolsuite/domain/crawl/CrawlRequest.java
@@ -0,0 +1,33 @@
+package org.cftoolsuite.domain.crawl;
+
+import org.apache.commons.lang3.StringUtils;
+import org.springframework.util.Assert;
+
+public record CrawlRequest(
+    String rootDomain,
+    String[] seeds,
+    String storageFolder,
+    Integer maxDepthOfCrawling,
+    String includesRegexFilter,
+    Integer numberOfCrawlers
+) {
+    public CrawlRequest {
+        Assert.hasText(rootDomain, "A root domain must be specified!");
+        Assert.isTrue(seeds != null && seeds.length >= 1, "At least one seed URL must be specified!");
+        String parentForStorageFolder = String.join(System.getProperty("file.separator"), System.getProperty("java.io.tmpdir"), "crawler4j");
+        if (StringUtils.isBlank(storageFolder)) {
+            storageFolder = parentForStorageFolder;
+        } else {
+            storageFolder = String.join(System.getProperty("file.separator"), parentForStorageFolder, storageFolder);
+        }
+        if (StringUtils.isBlank(includesRegexFilter)) {
+            includesRegexFilter = ".*(\\.(htm|html))$";
+        }
+        if (maxDepthOfCrawling == null || maxDepthOfCrawling <= 0) {
+            maxDepthOfCrawling = -1;
+        }
+        if (numberOfCrawlers == null || numberOfCrawlers <= 0) {
+            numberOfCrawlers = 3;
+        }
+    }
+}
diff --git a/src/main/java/org/cftoolsuite/domain/crawl/CrawlResponse.java b/src/main/java/org/cftoolsuite/domain/crawl/CrawlResponse.java
new file mode 100644
index 0000000..d90a535
--- /dev/null
+++ b/src/main/java/org/cftoolsuite/domain/crawl/CrawlResponse.java
@@ -0,0 +1,3 @@
+package org.cftoolsuite.domain.crawl;
+
+public record CrawlResponse(String id, String storageFolder, String result) {}
diff --git a/src/main/java/org/cftoolsuite/ui/MainLayout.java b/src/main/java/org/cftoolsuite/ui/MainLayout.java
index 34e257a..ea0d05d 100644
--- a/src/main/java/org/cftoolsuite/ui/MainLayout.java
+++ b/src/main/java/org/cftoolsuite/ui/MainLayout.java
@@ -1,6 +1,7 @@
 package org.cftoolsuite.ui;
 
 import org.cftoolsuite.ui.view.ChatView;
+import org.cftoolsuite.ui.view.CrawlView;
 import org.cftoolsuite.ui.view.DeleteView;
 import org.cftoolsuite.ui.view.DownloadView;
 import org.cftoolsuite.ui.view.HomeView;
@@ -36,13 +37,14 @@ public MainLayout() {
         Tabs actionTabs = createTabs();
 
         Tab uploadTab = createTab(VaadinIcon.UPLOAD.create(), "Upload documents", UploadView.class);
+        Tab crawlTab = createTab(VaadinIcon.SITEMAP.create(), "Crawl websites for documents", CrawlView.class);
         Tab chatTab = createTab(VaadinIcon.CHAT.create(), "Chat with AI bot about documents", ChatView.class);
         Tab listTab = createTab(VaadinIcon.LIST.create(), "List document metadata", ListView.class);
         Tab searchTab = createTab(VaadinIcon.SEARCH.create(), "Search for document metadata", SearchView.class);
         Tab summaryTab = createTab(VaadinIcon.BULLETS.create(), "Summarize a document", SummarizeView.class);
         Tab downloadTab = createTab(VaadinIcon.DOWNLOAD.create(), "Download a document", DownloadView.class);
         Tab deleteTab = createTab(VaadinIcon.TRASH.create(), "Delete a document", DeleteView.class);
-        actionTabs.add(uploadTab, chatTab, listTab, searchTab, summaryTab, downloadTab, deleteTab);
+        actionTabs.add(uploadTab, crawlTab, chatTab, listTab, searchTab, summaryTab, downloadTab, deleteTab);
         accordion.add("Actions", actionTabs).addThemeVariants(DetailsVariant.REVERSE);
 
         addToNavbar(true, homeTab, new DrawerToggle());
diff --git a/src/main/java/org/cftoolsuite/ui/view/CrawlView.java b/src/main/java/org/cftoolsuite/ui/view/CrawlView.java
new file mode 100644
index 0000000..90f242e
--- /dev/null
+++ b/src/main/java/org/cftoolsuite/ui/view/CrawlView.java
@@ -0,0 +1,118 @@
+package org.cftoolsuite.ui.view;
+
+import java.util.Arrays;
+import java.util.stream.Collectors;
+
+import org.cftoolsuite.client.SanfordClient;
+import org.cftoolsuite.domain.crawl.CrawlRequest;
+import org.cftoolsuite.domain.crawl.CrawlResponse;
+import org.cftoolsuite.ui.MainLayout;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.http.ResponseEntity;
+
+import com.vaadin.flow.component.button.Button;
+import com.vaadin.flow.component.html.H2;
+import com.vaadin.flow.component.notification.NotificationVariant;
+import com.vaadin.flow.component.orderedlayout.HorizontalLayout;
+import com.vaadin.flow.component.textfield.TextArea;
+import com.vaadin.flow.component.textfield.TextField;
+import com.vaadin.flow.router.PageTitle;
+import com.vaadin.flow.router.Route;
+
+import jakarta.annotation.PostConstruct;
+
+@PageTitle("sanford-ui » Crawl")
+@Route(value = "crawl", layout = MainLayout.class)
+public class CrawlView extends BaseView {
+
+    private static final Logger log = LoggerFactory.getLogger(CrawlView.class);
+
+    private TextField rootDomain;
+    private TextArea seeds;
+    private TextField includesRegexFilter;
+    private Button crawlButton;
+    private Button clearButton;
+    private HorizontalLayout buttons;
+
+    public CrawlView(SanfordClient sanfordClient) {
+        super(sanfordClient);
+    }
+
+    @PostConstruct
+    public void init() {
+        setAlignItems(Alignment.CENTER);
+        setJustifyContentMode(JustifyContentMode.CENTER);
+        add(getLogoImage());
+        setupUI();
+    }
+
+    @Override
+    protected void setupUI() {
+        this.rootDomain = new TextField("Root domain");
+        this.rootDomain.setRequired(true);
+        this.rootDomain.setHelperText("The root domain of the website you want to crawl which may also include sub-paths.");
+        this.seeds = new TextArea("Seeds");
+        this.seeds.setRequired(true);
+        this.seeds.setHelperText("A comma-separated list of seeds from which to execute crawling from. Each seed should be an additional sub-path from the root domain. Links found within each file found will be crawled so long as they match filter. By default there is no limit on crawl depth.");
+        this.includesRegexFilter = new TextField("Regex-based includes filter");
+        this.includesRegexFilter.setHelperText("A regex-based filter that will impact what files are crawled based upon file extensions. If left blank, the default will be .*(\\.(htm|html))$ .");
+        this.crawlButton = new Button("Crawl");
+        this.clearButton = new Button("Clear");
+        this.buttons = new HorizontalLayout();
+
+        buttons.add(crawlButton, clearButton);
+
+        buttons.setAlignItems(Alignment.CENTER);
+        buttons.setJustifyContentMode(JustifyContentMode.CENTER);
+        crawlButton.addClickListener(event -> crawlRequest());
+        clearButton.addClickListener(event -> clearAllFields());
+
+        add(
+            new H2("Crawl a website"),
+            rootDomain,
+            seeds,
+            includesRegexFilter,
+            buttons
+        );
+
+        autoSizeFields();
+    }
+
+    protected void crawlRequest() {
+        try {
+            CrawlRequest request =
+                new CrawlRequest(rootDomain.getValue(), convertToArray(seeds.getValue()), null, null, includesRegexFilter.getValue(), null);
+            ResponseEntity<CrawlResponse> response = sanfordClient.startCrawl(request);
+            if (response.getStatusCode().is2xxSuccessful() && response.getBody() != null) {
+                showNotification("Completed crawling website", NotificationVariant.LUMO_SUCCESS);
+            } else {
+                showNotification("Error crawling website", NotificationVariant.LUMO_ERROR);
+            }
+        } catch (Exception e) {
+            log.error("Error crawling website", e);
+            showNotification("Error crawling website: " + e.getMessage(), NotificationVariant.LUMO_ERROR);
+        }
+    }
+
+    protected String[] convertToArray(String commaSeparatedString) {
+        return Arrays.stream(commaSeparatedString.split(","))
+            .map(String::trim)
+            .filter(s -> !s.isEmpty())
+            .collect(Collectors.toSet())
+            .toArray(new String[0]);
+    }
+
+    @Override
+    protected void clearAllFields() {
+        rootDomain.clear();
+        seeds.clear();
+        includesRegexFilter.clear();
+    }
+
+    private void autoSizeFields() {
+        rootDomain.setWidth("480px");
+        seeds.setWidth("480px");
+        includesRegexFilter.setWidth("240px");
+    }
+}
\ No newline at end of file
diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml
index b4d56c7..69f79b3 100644
--- a/src/main/resources/application.yml
+++ b/src/main/resources/application.yml
@@ -2,6 +2,15 @@ spring:
   application:
     name: sanford-ui
 
+  mvc:
+    async:
+      request-timeout: ${SPRING_MVC_ASYNC_REQUEST_TIMEOUT:-1}
+
+  servlet:
+    multipart:
+      max-file-size: 100MB
+      max-request-size: 100MB
+
   threads:
     virtual:
       enabled: true