diff --git a/docs/.vuepress/dist/404.html b/docs/.vuepress/dist/404.html index da5ff84..66a5d30 100644 --- a/docs/.vuepress/dist/404.html +++ b/docs/.vuepress/dist/404.html @@ -38,10 +38,10 @@ } - + -

404

Page not found

That’s a Four-Oh-Four.

As you can see, the service mongo is removed and MongoDB-related connection environment variables (e.g. CRAWLAB_MONGO_HOST, CRAWLAB_MONGO_PORT) are changed to those of the external MongoDB. You can leave some environment variables empty if you don't need them.

`,3);function R(D,B){const e=o("ExternalLinkIcon"),t=o("Mermaid"),p=o("RouterLink");return i(),r("div",null,[m,n("p",null,[s("Docker is the most convenient and easiest way to install and deploy Crawlab. If you are not familiar with Docker, you can refer to "),n("a",k,[s("Docker Official Site"),a(e)]),s(" and install it on your local machine. Make sure you have installed Docker before proceeding any further steps.")]),v,b,n("ol",null,[n("li",null,[s("Install "),n("a",g,[s("Docker"),a(e)]),s(" and "),n("a",h,[s("Docker-Compose"),a(e)])]),y,_,w]),q,A,a(t,{id:"mermaid-42",code:"eJxVzE0LgkAQgOF7v2Kga140KiQ6iFe71E087Me4K8zuyDoh/vsUgvD8Prw98Wy8SgLv6gAwfbRLavTwEhWtIo6YPdki1DgSLwGjrAogtI2aBBNssYN7lj0guLbh6LiuutVgtNtPFkII0A9E5fGsb4XOT4aJUzn7QfBP3M9criYv1M58AaVPNew="}),n("p",null,[O,s(" is similar to the configuration in "),a(p,{to:"/en/guide/quick-start.html"},{default:u(()=>[s("Quick Start")]),_:1}),s(", and it is normally for demo purpose or managing a small number of crawlers. In SND, all Docker containers including Crawlab and MongoDB are in only a single machine, i.e. Master Node (see diagram above).")]),N,a(t,{id:"mermaid-56",code:"eJyNzrEKwjAUheHdp7jgaocmIlLEydE6uDiUDml7a4K3TUlTQt/eBoMQUOh6/m84LWlXS2EsXO8bgHGqnkYMEvKJrEpuukG44EB67rC3CwDoilyMFg34WMIpSc7g0uKhzSuMkJYfGSKLIosjjyL3EfvGX7Ez4cJaRZRt99WRV2xXa9Imc1JZ/BKXBoMHwXj927AVhv83b4dfXRo="}),M,a(t,{id:"mermaid-100",code:"eJyNzsELgjAUx/F7f8WDrnlwiwqJDuIxO0TQQTzM+dTR9MmciP99WiMQgrr+vp+xV2gaZCWMhfN1BdD1WWlEW0Hca6u8C+UIEbaaxhobOwGAXBmUVlEDt/A11EksOosGZp3C0fNOMPjJnczDjeCnb+kiW0S2jHwRuYtlElNTUhS6D+ppxiafT7ajxul1obQO1tvswDO2kaTJBEOlLH7I4DuDO8G4/G7YH4b/NnXpzG4vGRcL8wR5BXQH"}),f])}const W=c(d,[["render",R],["__file","docker.html.vue"]]);export{W as default}; diff --git a/docs/.vuepress/dist/assets/docker.html-c541ffa1.js b/docs/.vuepress/dist/assets/docker.html-c541ffa1.js deleted file mode 100644 index 5e65db3..0000000 --- a/docs/.vuepress/dist/assets/docker.html-c541ffa1.js +++ /dev/null @@ -1 +0,0 @@ -const e=JSON.parse('{"key":"v-296c4864","path":"/en/guide/installation/docker.html","title":"Installation: Docker","lang":"en-US","frontmatter":{"description":"Docker is the most convenient and easiest way to install and deploy Crawlab. If you are not familiar with Docker, you can refer to Docker Official Site (https://www.docker.com/)...","head":[["link",{"rel":"alternate","hreflang":"zh-cn","href":"https://docs.crawlab.cn/zh/guide/installation/docker.html"}],["meta",{"property":"og:url","content":"https://docs.crawlab.cn/en/guide/installation/docker.html"}],["meta",{"property":"og:title","content":"Installation: Docker"}],["meta",{"property":"og:description","content":"Docker is the most convenient and easiest way to install and deploy Crawlab. 
If you are not familiar with Docker, you can refer to Docker Official Site (https://www.docker.com/)..."}],["meta",{"property":"og:type","content":"article"}],["meta",{"property":"og:locale","content":"en-US"}],["meta",{"property":"og:locale:alternate","content":"zh-CN"}],["meta",{"property":"og:updated_time","content":"2023-04-05T09:39:57.000Z"}],["meta",{"property":"article:author","content":"Marvin Zhang"}],["meta",{"property":"article:modified_time","content":"2023-04-05T09:39:57.000Z"}],["script",{"type":"application/ld+json"},"{\\"@context\\":\\"https://schema.org\\",\\"@type\\":\\"Article\\",\\"headline\\":\\"Installation: Docker\\",\\"image\\":[\\"\\"],\\"dateModified\\":\\"2023-04-05T09:39:57.000Z\\",\\"author\\":[{\\"@type\\":\\"Person\\",\\"name\\":\\"Marvin Zhang\\"}]}"]]},"headers":[{"level":2,"title":"Main Process","slug":"main-process","link":"#main-process","children":[]},{"level":2,"title":"Standalone-Node Deployment","slug":"standalone-node-deployment","link":"#standalone-node-deployment","children":[]},{"level":2,"title":"Multi-Node Deployment","slug":"multi-node-deployment","link":"#multi-node-deployment","children":[{"level":3,"title":"Set up Master Node","slug":"set-up-master-node","link":"#set-up-master-node","children":[]},{"level":3,"title":"Set up Worker Nodes","slug":"set-up-worker-nodes","link":"#set-up-worker-nodes","children":[]}]},{"level":2,"title":"External MongoDB","slug":"external-mongodb","link":"#external-mongodb","children":[]}],"git":{"createdTime":1636800408000,"updatedTime":1680687597000,"contributors":[{"name":"Marvin Zhang","email":"tikazyq@163.com","commits":8}]},"readingTime":{"minutes":3.45,"words":1034},"filePathRelative":"en/guide/installation/docker.md","localizedDate":"November 13, 2021","autoDesc":true}');export{e as data}; diff --git a/docs/.vuepress/dist/assets/index.html-1641eeda.js b/docs/.vuepress/dist/assets/index.html-1641eeda.js new file mode 100644 index 0000000..0f57ee8 --- /dev/null +++ b/docs/.vuepress/dist/assets/index.html-1641eeda.js @@ -0,0 +1 @@ +import{_ as i,E as n,Z as l,$ as c,a0 as a,a3 as e,a1 as r,a2 as o,a4 as d}from"./framework-64cb0dab.js";const h={},u=d('

FAQ

What is Crawlab?

Crawlab is an open-source web crawler management platform. Its design goal is to help users easily create, manage, and monitor web crawler tasks. Crawlab provides a user-friendly graphical interface that allows users to configure crawler tasks, set crawling rules, monitor the crawling status, and view the crawling results.

You can check the Introduction section for more information.

Why can Crawlab execute crawlers written in different programming languages and frameworks?

Crawlab executes crawler tasks based on Shell commands. Therefore, theoretically, any crawler that can be run using Shell commands can be executed in Crawlab if the environment allows.

The Execution Command and Parameters in the crawler are concatenated to form the actual Shell command for the crawler task. For example, if the Execute Command of a certain crawler is python main.py and the parameter is spider1, then the Shell command for the crawler task will be python main.py spider1.

Why does Crawlab always pull version v0.6.0 instead of the latest version?

',8),p={href:"https://mirror.ccs.tencentyun.com",target:"_blank",rel:"noopener noreferrer"},w=a("h3",{id:"does-crawlab-support-scrapy",tabindex:"-1"},[a("a",{class:"header-anchor",href:"#does-crawlab-support-scrapy","aria-hidden":"true"},"#"),e(" Does Crawlab support Scrapy?")],-1),m=a("p",null,[e("Yes, Crawlab supports Scrapy, and it has a built-in pipeline that can be used. You just need to add "),a("code",null,"crawlab.CrawlabPipeline"),e(" to the "),a("code",null,"ITEM_PIPELINS"),e(" in the "),a("code",null,"settings.py"),e(" file to integrate it.")],-1),f=a("h3",{id:"does-crawlab-support-selenium",tabindex:"-1"},[a("a",{class:"header-anchor",href:"#does-crawlab-support-selenium","aria-hidden":"true"},"#"),e(" Does Crawlab support Selenium?")],-1);function b(g,_){const s=n("ExternalLinkIcon"),t=n("RouterLink");return l(),c("div",null,[u,a("p",null,[e("For users in China, it is highly possible that you have configured the Aliyun mirror proxy. Please use other mirror proxies, such as "),a("a",p,[e("Tencent Cloud mirror proxy"),r(s)]),e(".")]),w,m,a("p",null,[e("For more details, please refer to "),r(t,{to:"/en/guide/spider/integration.html"},{default:o(()=>[e("Spider Integration")]),_:1}),e(".")]),f,a("p",null,[e("Yes, Crawlab supports Selenium for web scraping. For more details, please refer to "),r(t,{to:"/en/guide/spider/selenium.html"},{default:o(()=>[e("Selenium Spider Integration")]),_:1}),e(".")])])}const x=i(h,[["render",b],["__file","index.html.vue"]]);export{x as default}; diff --git a/docs/.vuepress/dist/assets/index.html-454a7ac4.js b/docs/.vuepress/dist/assets/index.html-454a7ac4.js new file mode 100644 index 0000000..c99e753 --- /dev/null +++ b/docs/.vuepress/dist/assets/index.html-454a7ac4.js @@ -0,0 +1 @@ +import{_ as d,E as l,Z as o,$ as i,a0 as a,a3 as e,a1 as r,a2 as c,a4 as h}from"./framework-64cb0dab.js";const s={},p=h('

FAQ

What is Crawlab?

Crawlab is an open-source web crawler management platform. Its design goal is to help users easily create, manage, and monitor web crawler tasks. Crawlab provides a user-friendly graphical interface that allows users to configure crawler tasks, set crawling rules, monitor the crawling status, and view the crawling results with simple operations.

You can check the Introduction section for more information.

Why can Crawlab execute crawlers written in different programming languages and frameworks?

Crawlab executes crawler tasks based on Shell commands. Therefore, if the environment allows, theoretically any crawler that can be run with Shell commands can be executed in Crawlab.

The Execution Command and Parameters in the crawler are concatenated to form the actual Shell command for the crawler task. For example, if the Execute Command of a certain crawler is python main.py and the parameter is spider1, then the Shell command for the crawler task will be python main.py spider1.

Why does Crawlab always pull version v0.6.0 instead of the latest version?

',8),u={href:"https://mirror.ccs.tencentyun.com",target:"_blank",rel:"noopener noreferrer"},b=a("h3",{id:"crawlab-支持-scrapy-吗",tabindex:"-1"},[a("a",{class:"header-anchor",href:"#crawlab-支持-scrapy-吗","aria-hidden":"true"},"#"),e(" Crawlab 支持 Scrapy 吗?")],-1),_=a("p",null,[e("Crawlab 支持 Scrapy,而且有内置的 Pipeline 可以使用,只需要在 "),a("code",null,"settings.py"),e(" 的 "),a("code",null,"ITEM_PIPELINS"),e(" 中加入 "),a("code",null,"crawlab.CrawlabPipeline"),e(" 即可集成。")],-1),w=a("h3",{id:"crawlab-支持-selenium-吗",tabindex:"-1"},[a("a",{class:"header-anchor",href:"#crawlab-支持-selenium-吗","aria-hidden":"true"},"#"),e(" Crawlab 支持 Selenium 吗?")],-1);function m(f,x){const t=l("ExternalLinkIcon"),n=l("RouterLink");return o(),i("div",null,[p,a("p",null,[e("对于中国国内用户,很有可能您配置了阿里云镜像代理,请使用其他镜像代理,例如"),a("a",u,[e("腾讯云镜像代理"),r(t)]),e("。")]),b,_,a("p",null,[e("详情请参考 "),r(n,{to:"/zh/guide/spider/integration.html"},{default:c(()=>[e("爬虫集成")]),_:1}),e("。")]),w,a("p",null,[e("Crawlab 支持 Selenium 爬虫,详情参考 "),r(n,{to:"/zh/guide/spider/selenium.html"},{default:c(()=>[e("Selenium 爬虫集成")]),_:1}),e("。")])])}const S=d(s,[["render",m],["__file","index.html.vue"]]);export{S as default}; diff --git a/docs/.vuepress/dist/assets/installation.html-35c213fc.js b/docs/.vuepress/dist/assets/installation.html-41eb0872.js similarity index 86% rename from docs/.vuepress/dist/assets/installation.html-35c213fc.js rename to docs/.vuepress/dist/assets/installation.html-41eb0872.js index bffd081..8db1aad 100644 --- a/docs/.vuepress/dist/assets/installation.html-35c213fc.js +++ b/docs/.vuepress/dist/assets/installation.html-41eb0872.js @@ -16,6 +16,7 @@ import{_ as p,E as l,Z as c,$ as i,a0 as n,a3 as s,a1 as a,a4 as e}from"./framew volumes: - "/opt/crawlab/.crawlab/master:/root/.crawlab" # persistent crawlab metadata - "/opt/crawlab/master:/data" # persistent crawlab data + - "/var/crawlab/log:/var/logs/crawlab" # log persistent ports: - "8080:8080" # exposed api port depends_on: @@ -31,7 +32,7 @@ import{_ as p,E as l,Z as c,$ as i,a0 as n,a3 as s,a1 as a,a4 as e}from"./framew - "/opt/crawlab/mongo/data/db:/data/db" # persistent mongo data ports: - "27017:27017" # expose mongo port to host machine -

Then, execute docker-compose up -d and navigate to http://<your_ip>:8080 in the browser to start using Crawlab.

Multi-Node Deployment

`,5),q=e(`

Multi-Node Deployment (MND) is normally used in production environments, where a cluster consisting of a Master Node and multiple Worker Nodes is deployed. The Worker Nodes connect to the Master Node, which serves as the central control system of the cluster.

The configuration for MND is more complex than for SND, but you can follow the guidelines below to set up a small cluster, which is quite straightforward.

Set up Master Node

Create docker-compose.yml on the Master Node and enter the content below. Then start it by executing docker-compose up -d.

# master node
+

Then, execute docker-compose up -d and navigate to http://<your_ip>:8080 in the browser to start using Crawlab.

Multi-Node Deployment

`,5),q=e(`

Multi-Node Deployment (MND) is normally used in production environments, where a cluster consisting of a Master Node and multiple Worker Nodes is deployed. The Worker Nodes connect to the Master Node, which serves as the central control system of the cluster.

The configuration for MND is more complex than for SND, but you can follow the guidelines below to set up a small cluster, which is quite straightforward.

Set up Master Node

Create docker-compose.yml on the Master Node and enter the content below. Then start it by executing docker-compose up -d.

# master node
 version: '3.3'
 services:
   master:
@@ -50,6 +51,7 @@ import{_ as p,E as l,Z as c,$ as i,a0 as n,a3 as s,a1 as a,a4 as e}from"./framew
     volumes:
       - "/opt/crawlab/.crawlab/master:/root/.crawlab"  # persistent crawlab metadata
       - "/opt/crawlab/master:/data"  # persistent crawlab data
+      - "/var/crawlab/log:/var/logs/crawlab" # log persistent 
     ports:
       - "8080:8080"  # exposed api port
       - "9666:9666"  # exposed grpc port
@@ -66,7 +68,7 @@ import{_ as p,E as l,Z as c,$ as i,a0 as n,a3 as s,a1 as a,a4 as e}from"./framew
       - "/opt/crawlab/mongo/data/db:/data/db"  # persistent mongo data
     ports:
       - "27017:27017"  # expose mongo port to host machine
-

Set up Worker Nodes

Create docker-compose.yml on each Worker Node and enter the content below. Then start it by executing docker-compose up -d.

# worker node
+

Set up Worker Nodes

Create docker-compose.yml on each Worker Node and enter the content below. Then start it by executing docker-compose up -d.

# worker node
 version: '3.3'
 services:
   worker:
@@ -103,7 +105,8 @@ import{_ as p,E as l,Z as c,$ as i,a0 as n,a3 as s,a1 as a,a4 as e}from"./framew
     volumes:
       - "/opt/crawlab/.crawlab/master:/root/.crawlab"  # persistent crawlab metadata
       - "/opt/crawlab/master:/data"  # persistent crawlab data
+      - "/var/crawlab/log:/var/logs/crawlab" # log persistent 
     ports:
       - "8080:8080"  # exposed api port
       - "9666:9666"  # exposed grpc port
-

As you can see, the service mongo is removed and MongoDB-related connection environment variables (e.g. CRAWLAB_MONGO_HOST, CRAWLAB_MONGO_PORT) are changed to those of the external MongoDB. You can leave some environment variables empty if you don't need them.

`,3);function O(N,f){const t=l("ExternalLinkIcon"),o=l("Mermaid");return c(),i("div",null,[u,n("p",null,[s("Crawlab Pro is distributed via "),n("a",d,[s("Docker"),a(t)]),s(". You can install it on any platform that supports Docker.")]),m,n("ol",null,[n("li",null,[s("Install "),n("a",k,[s("Docker"),a(t)]),s(" and "),n("a",v,[s("Docker-Compose"),a(t)])]),b,g,h]),y,_,a(o,{id:"mermaid-67",code:"eJxVzE0LgkAQgOF7v2Kga140KiQ6iFe71E087Me4K8zuyDoh/vsUgvD8Prw98Wy8SgLv6gAwfbRLavTwEhWtIo6YPdki1DgSLwGjrAogtI2aBBNssYN7lj0guLbh6LiuutVgtNtPFkII0A9E5fGsb4XOT4aJUzn7QfBP3M9criYv1M58AaVPNew="}),w,a(o,{id:"mermaid-81",code:"eJyNzrEKwjAUheHdp7jgaocmIlLEydE6uDiUDml7a4K3TUlTQt/eBoMQUOh6/m84LWlXS2EsXO8bgHGqnkYMEvKJrEpuukG44EB67rC3CwDoilyMFg34WMIpSc7g0uKhzSuMkJYfGSKLIosjjyL3EfvGX7Ez4cJaRZRt99WRV2xXa9Imc1JZ/BKXBoMHwXj927AVhv83b4dfXRo="}),q,a(o,{id:"mermaid-125",code:"eJyNzsELgjAUx/F7f8WDrnlwiwqJDuIxO0TQQTzM+dTR9MmciP99WiMQgrr+vp+xV2gaZCWMhfN1BdD1WWlEW0Hca6u8C+UIEbaaxhobOwGAXBmUVlEDt/A11EksOosGZp3C0fNOMPjJnczDjeCnb+kiW0S2jHwRuYtlElNTUhS6D+ppxiafT7ajxul1obQO1tvswDO2kaTJBEOlLH7I4DuDO8G4/G7YH4b/NnXpzG4vGRcL8wR5BXQH"}),A])}const R=p(r,[["render",O],["__file","installation.html.vue"]]);export{R as default}; +

As you can see, the service mongo is removed and MongoDB-related connection environment variables (e.g. CRAWLAB_MONGO_HOST, CRAWLAB_MONGO_PORT) are changed to those of the external MongoDB. You can leave some environment variables empty if you don't need them.

`,3);function O(N,f){const t=l("ExternalLinkIcon"),o=l("Mermaid");return c(),i("div",null,[u,n("p",null,[s("Crawlab Pro is distributed via "),n("a",d,[s("Docker"),a(t)]),s(". You can install it on any platform that supports Docker.")]),m,n("ol",null,[n("li",null,[s("Install "),n("a",k,[s("Docker"),a(t)]),s(" and "),n("a",v,[s("Docker-Compose"),a(t)])]),b,g,h]),y,_,a(o,{id:"mermaid-67",code:"eJxVzE0LgkAQgOF7v2Kga140KiQ6iFe71E087Me4K8zuyDoh/vsUgvD8Prw98Wy8SgLv6gAwfbRLavTwEhWtIo6YPdki1DgSLwGjrAogtI2aBBNssYN7lj0guLbh6LiuutVgtNtPFkII0A9E5fGsb4XOT4aJUzn7QfBP3M9criYv1M58AaVPNew="}),w,a(o,{id:"mermaid-81",code:"eJyNzrEKwjAUheHdp7jgaocmIlLEydE6uDiUDml7a4K3TUlTQt/eBoMQUOh6/m84LWlXS2EsXO8bgHGqnkYMEvKJrEpuukG44EB67rC3CwDoilyMFg34WMIpSc7g0uKhzSuMkJYfGSKLIosjjyL3EfvGX7Ez4cJaRZRt99WRV2xXa9Imc1JZ/BKXBoMHwXj927AVhv83b4dfXRo="}),q,a(o,{id:"mermaid-125",code:"eJyNzsELgjAUx/F7f8WDrnlwiwqJDuIxO0TQQTzM+dTR9MmciP99WiMQgrr+vp+xV2gaZCWMhfN1BdD1WWlEW0Hca6u8C+UIEbaaxhobOwGAXBmUVlEDt/A11EksOosGZp3C0fNOMPjJnczDjeCnb+kiW0S2jHwRuYtlElNTUhS6D+ppxiafT7ajxul1obQO1tvswDO2kaTJBEOlLH7I4DuDO8G4/G7YH4b/NnXpzG4vGRcL8wR5BXQH"}),A])}const R=p(r,[["render",O],["__file","installation.html.vue"]]);export{R as default}; diff --git a/docs/.vuepress/dist/assets/installation.html-5ec3535e.js b/docs/.vuepress/dist/assets/installation.html-5ec3535e.js deleted file mode 100644 index 3d8535d..0000000 --- a/docs/.vuepress/dist/assets/installation.html-5ec3535e.js +++ /dev/null @@ -1 +0,0 @@ -const e=JSON.parse(`{"key":"v-980061e0","path":"/en/pro/installation.html","title":"Installation","lang":"en-US","frontmatter":{"description":"Crawlab Pro is distributed via Docker (https://www.docker.com/). You can install it on any platform that supports Docker. Prerequisites If you don't know how to set up Docker en...","head":[["link",{"rel":"alternate","hreflang":"zh-cn","href":"https://docs.crawlab.cn/zh/pro/installation.html"}],["meta",{"property":"og:url","content":"https://docs.crawlab.cn/en/pro/installation.html"}],["meta",{"property":"og:title","content":"Installation"}],["meta",{"property":"og:description","content":"Crawlab Pro is distributed via Docker (https://www.docker.com/). You can install it on any platform that supports Docker. 
Prerequisites If you don't know how to set up Docker en..."}],["meta",{"property":"og:type","content":"article"}],["meta",{"property":"og:locale","content":"en-US"}],["meta",{"property":"og:locale:alternate","content":"zh-CN"}],["meta",{"property":"og:updated_time","content":"2023-04-05T09:39:57.000Z"}],["meta",{"property":"article:author","content":"Marvin Zhang"}],["meta",{"property":"article:modified_time","content":"2023-04-05T09:39:57.000Z"}],["script",{"type":"application/ld+json"},"{\\"@context\\":\\"https://schema.org\\",\\"@type\\":\\"Article\\",\\"headline\\":\\"Installation\\",\\"image\\":[\\"\\"],\\"dateModified\\":\\"2023-04-05T09:39:57.000Z\\",\\"author\\":[{\\"@type\\":\\"Person\\",\\"name\\":\\"Marvin Zhang\\"}]}"]]},"headers":[{"level":2,"title":"Prerequisites","slug":"prerequisites","link":"#prerequisites","children":[]},{"level":2,"title":"Main Process","slug":"main-process","link":"#main-process","children":[]},{"level":2,"title":"Standalone-Node Deployment","slug":"standalone-node-deployment","link":"#standalone-node-deployment","children":[]},{"level":2,"title":"Multi-Node Deployment","slug":"multi-node-deployment","link":"#multi-node-deployment","children":[{"level":3,"title":"Set up Master Node","slug":"set-up-master-node","link":"#set-up-master-node","children":[]},{"level":3,"title":"Set up Worker Nodes","slug":"set-up-worker-nodes","link":"#set-up-worker-nodes","children":[]}]},{"level":2,"title":"External MongoDB","slug":"external-mongodb","link":"#external-mongodb","children":[]}],"git":{"createdTime":1665544716000,"updatedTime":1680687597000,"contributors":[{"name":"Marvin Zhang","email":"tikazyq@163.com","commits":4}]},"readingTime":{"minutes":3.76,"words":1129},"filePathRelative":"en/pro/installation.md","localizedDate":"October 12, 2022","autoDesc":true}`);export{e as data}; diff --git a/docs/.vuepress/dist/assets/installation.html-720e4239.js b/docs/.vuepress/dist/assets/installation.html-720e4239.js deleted file mode 100644 index d19a5a7..0000000 --- a/docs/.vuepress/dist/assets/installation.html-720e4239.js +++ /dev/null @@ -1 +0,0 @@ -const e=JSON.parse('{"key":"v-ba122056","path":"/zh/pro/installation.html","title":"安装","lang":"zh-CN","frontmatter":{"description":"Crawlab 专业版通过 Docker 镜像的方式提供,您可以在任何支持 Docker 的环境中安装。 前提条件 如果您不知道如何安装 Docker 环境,请参考 Docker 安装 (../guide/installation/docker)。 Docker 环境. 请保证您已经安装了 Docker 以及 Docker Compose。; 许可证....","head":[["link",{"rel":"alternate","hreflang":"en-us","href":"https://docs.crawlab.cn/en/pro/installation.html"}],["meta",{"property":"og:url","content":"https://docs.crawlab.cn/zh/pro/installation.html"}],["meta",{"property":"og:title","content":"安装"}],["meta",{"property":"og:description","content":"Crawlab 专业版通过 Docker 镜像的方式提供,您可以在任何支持 Docker 的环境中安装。 前提条件 如果您不知道如何安装 Docker 环境,请参考 Docker 安装 (../guide/installation/docker)。 Docker 环境. 
请保证您已经安装了 Docker 以及 Docker Compose。; 许可证...."}],["meta",{"property":"og:type","content":"article"}],["meta",{"property":"og:locale","content":"zh-CN"}],["meta",{"property":"og:locale:alternate","content":"en-US"}],["meta",{"property":"og:updated_time","content":"2023-04-05T06:59:46.000Z"}],["meta",{"property":"article:author","content":"Marvin Zhang"}],["meta",{"property":"article:modified_time","content":"2023-04-05T06:59:46.000Z"}],["script",{"type":"application/ld+json"},"{\\"@context\\":\\"https://schema.org\\",\\"@type\\":\\"Article\\",\\"headline\\":\\"安装\\",\\"image\\":[\\"\\"],\\"dateModified\\":\\"2023-04-05T06:59:46.000Z\\",\\"author\\":[{\\"@type\\":\\"Person\\",\\"name\\":\\"Marvin Zhang\\"}]}"]]},"headers":[{"level":2,"title":"前提条件","slug":"前提条件","link":"#前提条件","children":[]},{"level":2,"title":"主流程","slug":"主流程","link":"#主流程","children":[]},{"level":2,"title":"单节点部署","slug":"单节点部署","link":"#单节点部署","children":[]},{"level":2,"title":"多节点部署","slug":"多节点部署","link":"#多节点部署","children":[{"level":3,"title":"搭建主节点","slug":"搭建主节点","link":"#搭建主节点","children":[]},{"level":3,"title":"搭建工作节点","slug":"搭建工作节点","link":"#搭建工作节点","children":[]}]},{"level":2,"title":"外部 MongoDB","slug":"外部-mongodb","link":"#外部-mongodb","children":[]}],"git":{"createdTime":1665544716000,"updatedTime":1680677986000,"contributors":[{"name":"Marvin Zhang","email":"tikazyq@163.com","commits":5}]},"readingTime":{"minutes":5.04,"words":1513},"filePathRelative":"zh/pro/installation.md","localizedDate":"2022年10月12日","autoDesc":true}');export{e as data}; diff --git a/docs/.vuepress/dist/assets/installation.html-33e8c9f8.js b/docs/.vuepress/dist/assets/installation.html-dc29e5fa.js similarity index 79% rename from docs/.vuepress/dist/assets/installation.html-33e8c9f8.js rename to docs/.vuepress/dist/assets/installation.html-dc29e5fa.js index 691ba32..d334ae9 100644 --- a/docs/.vuepress/dist/assets/installation.html-33e8c9f8.js +++ b/docs/.vuepress/dist/assets/installation.html-dc29e5fa.js @@ -1,4 +1,4 @@ -import{_ as l,E as p,Z as c,$ as i,a0 as n,a3 as s,a1 as a,a4 as e}from"./framework-64cb0dab.js";const u={},r=e('

Installation

Crawlab Pro is distributed as a Docker image, and you can install it in any environment that supports Docker.

Prerequisites

Tip

If you don't know how to set up a Docker environment, please refer to Docker Installation.

Main Process

Tip

If you don't know how to obtain a license, please refer to License.

The main process of installing Crawlab Pro is the same as that of the Crawlab Community Edition; the only difference is that a license needs to be provided during installation. You can refer to the installation process in Install Community Edition with Docker.

',8),k={href:"https://www.docker.com/",target:"_blank",rel:"noopener noreferrer"},d={href:"https://docs.docker.com/compose/",target:"_blank",rel:"noopener noreferrer"},m=n("li",null,"拉取 Crawlab Docker 镜像(如果没有 MongoDB,也需要拉取)",-1),v=n("li",null,[s("创建 "),n("code",null,"docker-compose.yml"),s(" 并进行配置(需要包含 "),n("a",{href:"./license"},"许可证"),s(")")],-1),b=n("li",null,"启动 Docker 容器",-1),g=n("div",{class:"hint-container warning"},[n("p",{class:"hint-container-title"},"注意"),n("p",null,"在接下来的指南中,我们假设您已经安装了 Docker 和 Docker-Compose,并已经拉取了相应的 Docker 镜像。")],-1),_=n("h2",{id:"单节点部署",tabindex:"-1"},[n("a",{class:"header-anchor",href:"#单节点部署","aria-hidden":"true"},"#"),s(" 单节点部署")],-1),y=e(`

Standalone-Node Deployment (SND) is similar to the configuration in Quick Start, and it is normally used for demos or managing a small number of crawlers. In SND, all Docker containers (including Crawlab and MongoDB) are on a single machine, i.e. the Master Node (see diagram above).

Create docker-compose.yml and enter the content below.

version: '3.3'
+import{_ as l,E as p,Z as c,$ as i,a0 as n,a3 as s,a1 as a,a4 as e}from"./framework-64cb0dab.js";const r={},u=e('

Installation

Crawlab Pro is distributed as a Docker image, and you can install it in any environment that supports Docker.

Prerequisites

Tip

If you don't know how to set up a Docker environment, please refer to Docker Installation.

  • Docker environment. Please make sure you have installed Docker and Docker Compose.
  • License. You need a license to use Crawlab Pro. If you don't have one yet, please contact us to obtain one.

Main Process

Tip

If you don't know how to obtain a license, please refer to License.

The main process of installing Crawlab Pro is the same as that of the Crawlab Community Edition; the only difference is that a license needs to be provided during installation. You can refer to the installation process in Install Community Edition with Docker.

',8),k={href:"https://www.docker.com/",target:"_blank",rel:"noopener noreferrer"},d={href:"https://docs.docker.com/compose/",target:"_blank",rel:"noopener noreferrer"},m=n("li",null,"拉取 Crawlab Docker 镜像(如果没有 MongoDB,也需要拉取)",-1),v=n("li",null,[s("创建 "),n("code",null,"docker-compose.yml"),s(" 并进行配置(需要包含 "),n("a",{href:"./license"},"许可证"),s(")")],-1),b=n("li",null,"启动 Docker 容器",-1),g=n("div",{class:"hint-container warning"},[n("p",{class:"hint-container-title"},"注意"),n("p",null,"在接下来的指南中,我们假设您已经安装了 Docker 和 Docker-Compose,并已经拉取了相应的 Docker 镜像。")],-1),_=n("h2",{id:"单节点部署",tabindex:"-1"},[n("a",{class:"header-anchor",href:"#单节点部署","aria-hidden":"true"},"#"),s(" 单节点部署")],-1),y=e(`

Standalone-Node Deployment (SND) is similar to the configuration in Quick Start, and it is normally used for demos or managing a small number of crawlers. In SND, all Docker containers (including Crawlab and MongoDB) are on a single machine, i.e. the Master Node (see diagram above).

Create docker-compose.yml and enter the content below.

version: '3.3'
 services:
   master:
     image: crawlabteam/crawlab-pro
@@ -16,6 +16,7 @@ import{_ as l,E as p,Z as c,$ as i,a0 as n,a3 as s,a1 as a,a4 as e}from"./framew
     volumes:
       - "/opt/crawlab/.crawlab/master:/root/.crawlab"  # 持久化 crawlab 元数据
       - "/opt/crawlab/master:/data"  # 持久化 crawlab 数据
+      - "/var/crawlab/log:/var/logs/crawlab" # 持久化 crawlab 任务日志
     ports:
       - "8080:8080"  # 开放 api 端口
     depends_on:
@@ -31,7 +32,7 @@ import{_ as l,E as p,Z as c,$ as i,a0 as n,a3 as s,a1 as a,a4 as e}from"./framew
       - "/opt/crawlab/mongo/data/db:/data/db"  # 持久化 mongo 数据
     ports:
       - "27017:27017"  # 开放 mongo 端口到宿主机
-

Execute docker-compose up -d and navigate to http://<your_ip>:8080 in the browser to start using Crawlab.

Multi-Node Deployment

`,5),q=e(`

Multi-Node Deployment (MND) is normally used in production environments consisting of a Master Node and multiple Worker Nodes. The Master Node connects with the Worker Nodes and serves as the central control of the cluster.

The configuration of MND is more complex than that of SND, but you can follow the tutorial below to set up a small cluster; the whole process is quite straightforward.

Set up Master Node

Create docker-compose.yml on the Master Node and enter the content below. Then execute docker-compose up -d to start the container.

# master node
+

Execute docker-compose up -d and navigate to http://<your_ip>:8080 in the browser to start using Crawlab.

Multi-Node Deployment

`,5),q=e(`

Multi-Node Deployment (MND) is normally used in production environments consisting of a Master Node and multiple Worker Nodes. The Master Node connects with the Worker Nodes and serves as the central control of the cluster.

The configuration of MND is more complex than that of SND, but you can follow the tutorial below to set up a small cluster; the whole process is quite straightforward.

Set up Master Node

Create docker-compose.yml on the Master Node and enter the content below. Then execute docker-compose up -d to start the container.

# master node
 version: '3.3'
 services:
   master:
@@ -50,6 +51,7 @@ import{_ as l,E as p,Z as c,$ as i,a0 as n,a3 as s,a1 as a,a4 as e}from"./framew
     volumes:
       - "/opt/crawlab/.crawlab/master:/root/.crawlab"  # 持久化 crawlab 元数据
       - "/opt/crawlab/master:/data"  # 持久化 crawlab 数据
+      - "/var/crawlab/log:/var/logs/crawlab" # 持久化 crawlab 任务日志
     ports:
       - "8080:8080"  # 开放 api 端口
       - "9666:9666"  # 开放 grpc 端口
@@ -66,7 +68,7 @@ import{_ as l,E as p,Z as c,$ as i,a0 as n,a3 as s,a1 as a,a4 as e}from"./framew
       - "/opt/crawlab/mongo/data/db:/data/db"  # 持久化 mongo 数据
     ports:
       - "27017:27017"  # 开放 mongo 端口到宿主机
-

Set up Worker Nodes

Create docker-compose.yml on each Worker Node and enter the content below. Then execute docker-compose up -d to start the container.

# worker node
+

Set up Worker Nodes

Create docker-compose.yml on each Worker Node and enter the content below. Then execute docker-compose up -d to start the container.

# worker node
 version: '3.3'
 services:
   worker:
@@ -81,7 +83,8 @@ import{_ as l,E as p,Z as c,$ as i,a0 as n,a3 as s,a1 as a,a4 as e}from"./framew
     volumes:
       - "/opt/crawlab/.crawlab/worker:/root/.crawlab"  # 持久化 crawlab 元数据
       - "/opt/crawlab/worker:/data"  # 持久化 crawlab 数据
-

Please note that you need to replace <master_node_ip> with the Master Node IP address and make sure it is accessible from the Worker Nodes.

Once the Master Node and Worker Nodes have started, you can navigate to http://<master_node_ip>:8080 and start using Crawlab.

Note

Open Master Node Ports

As Worker Nodes connect to the Master Node via ports 8080 (API) and 9666 (gRPC), you need to make sure both ports are open and not blocked by the Master Node's firewall.

External MongoDB

In the Multi-Node Deployment (MND) introduced earlier, you may have noticed that MongoDB is deployed on the Master Node by default. For performance reasons, however, such a convenient deployment configuration can cause problems, because MongoDB itself may become the bottleneck, especially in large-scale distributed systems.

Fortunately, this issue can be solved by deploying an external MongoDB on other nodes or with cloud database providers (e.g. AWS, Azure, Aliyun). In this way, MongoDB can be easily scaled, so the stability of the database can be effectively guaranteed. Please refer to the diagram below.

`,14),h=e(`

The Master Node configuration file docker-compose.yml is slightly different from that of the default Multi-Node Deployment (MND). Please refer to the content below.

# master node with external mongodb
+      - "/var/crawlab/log:/var/logs/crawlab" # persistent crawlab task logs
+

Please note that you need to replace <master_node_ip> with the Master Node IP address and make sure it is accessible from the Worker Nodes.

Once the Master Node and Worker Nodes have started, you can navigate to http://<master_node_ip>:8080 and start using Crawlab.

Note

Open Master Node Ports

As Worker Nodes connect to the Master Node via ports 8080 (API) and 9666 (gRPC), you need to make sure both ports are open and not blocked by the Master Node's firewall.

External MongoDB

In the Multi-Node Deployment (MND) introduced earlier, you may have noticed that MongoDB is deployed on the Master Node by default. For performance reasons, however, such a convenient deployment configuration can cause problems, because MongoDB itself may become the bottleneck, especially in large-scale distributed systems.

Fortunately, this issue can be solved by deploying an external MongoDB on other nodes or with cloud database providers (e.g. AWS, Azure, Aliyun). In this way, MongoDB can be easily scaled, so the stability of the database can be effectively guaranteed. Please refer to the diagram below.

`,14),h=e(`

The Master Node configuration file docker-compose.yml is slightly different from that of the default Multi-Node Deployment (MND). Please refer to the content below.

# master node with external mongodb
 version: '3.3'
 services:
   master:
@@ -103,7 +106,8 @@ import{_ as l,E as p,Z as c,$ as i,a0 as n,a3 as s,a1 as a,a4 as e}from"./framew
     volumes:
       - "/opt/crawlab/.crawlab/master:/root/.crawlab"  # 持久化 crawlab 元数据
       - "/opt/crawlab/master:/data"  # 持久化 crawlab 数据
+      - "/var/crawlab/log:/var/logs/crawlab" # 持久化 crawlab 任务日志
     ports:
       - "8080:8080"  # 开放 api 端口
       - "9666:9666"  # 开放 grpc 端口
-

As you can see, the service mongo is removed, and MongoDB-related connection environment variables (e.g. CRAWLAB_MONGO_HOST, CRAWLAB_MONGO_PORT) now point to the external MongoDB. You can leave the environment variables you don't need empty.

`,3);function A(O,w){const o=p("ExternalLinkIcon"),t=p("Mermaid");return c(),i("div",null,[r,n("ol",null,[n("li",null,[s("安装 "),n("a",k,[s("Docker"),a(o)]),s(" 和 "),n("a",d,[s("Docker-Compose"),a(o)])]),m,v,b]),g,_,a(t,{id:"mermaid-67",code:"eJxLy8kvT85ILCpRCHHiUlAoLk1KL0osyFBQeto79UVX0/OmnS+bVzzfu+n9no5gP5f3ezqVgKoUFHKjn+zYDZGPVbDR1bVTyE2P9s3PS893cYoFqkjNSwGZVlKZk6qQq5CWmZNjpWySZGGcZKSTnJ+TX2RVnpFZkopQkg5VY2aebGSciKIGAHDOPmE="}),y,a(t,{id:"mermaid-81",code:"eJxLy8kvT85ILCpR8AniUlAoLk1KL0osyFBQerpk1ouupudNO182r3i+d9P7PR2+fi7v93QqAVUpKORGP9mxGyIfq2Cjq2unUG4Y/XT70id750BEFQxjIQqhskaoskaossaossYg2dS8FJCDSipzUoHq0jJzcqyUTZIsjJOMdJLzc/KLrMozMktS4UrKDaFqUs0SjYyTsasxIkKNMW41AKN/dP8="}),q,a(t,{id:"mermaid-125",code:"eJxLy8kvT85ILCpR8AniUlAoLk1KL0osyFBQerpk2svmFQq++Xnp+S5OCs9ntTxdMutFV9Pzpp1A8ed7N73f0+Hr5/J+T6cSUJ+CQkpmUWpySWZ+nkKIE1ggN/rJjt0QDbEKNrq6dgrlhtFPty99sncORFTBMBaiECprhCprhCprjCprDJVNj4a6EGpHLlA4NS8F5JWSypxUoPa0zJwcK2WTJAvjJCOd5Pyc/CKr8ozMklS4knJDqJpUs0Qj42TsaoyIUGNMWE1uOlSNmXmykXEiihoA+hSU6g=="}),h])}const C=l(u,[["render",A],["__file","installation.html.vue"]]);export{C as default}; +

As you can see, the service mongo is removed, and MongoDB-related connection environment variables (e.g. CRAWLAB_MONGO_HOST, CRAWLAB_MONGO_PORT) now point to the external MongoDB. You can leave the environment variables you don't need empty.

`,3);function A(O,w){const o=p("ExternalLinkIcon"),t=p("Mermaid");return c(),i("div",null,[u,n("ol",null,[n("li",null,[s("安装 "),n("a",k,[s("Docker"),a(o)]),s(" 和 "),n("a",d,[s("Docker-Compose"),a(o)])]),m,v,b]),g,_,a(t,{id:"mermaid-67",code:"eJxLy8kvT85ILCpRCHHiUlAoLk1KL0osyFBQeto79UVX0/OmnS+bVzzfu+n9no5gP5f3ezqVgKoUFHKjn+zYDZGPVbDR1bVTyE2P9s3PS893cYoFqkjNSwGZVlKZk6qQq5CWmZNjpWySZGGcZKSTnJ+TX2RVnpFZkopQkg5VY2aebGSciKIGAHDOPmE="}),y,a(t,{id:"mermaid-81",code:"eJxLy8kvT85ILCpR8AniUlAoLk1KL0osyFBQerpk1ouupudNO182r3i+d9P7PR2+fi7v93QqAVUpKORGP9mxGyIfq2Cjq2unUG4Y/XT70id750BEFQxjIQqhskaoskaossaossYg2dS8FJCDSipzUoHq0jJzcqyUTZIsjJOMdJLzc/KLrMozMktS4UrKDaFqUs0SjYyTsasxIkKNMW41AKN/dP8="}),q,a(t,{id:"mermaid-125",code:"eJxLy8kvT85ILCpR8AniUlAoLk1KL0osyFBQerpk2svmFQq++Xnp+S5OCs9ntTxdMutFV9Pzpp1A8ed7N73f0+Hr5/J+T6cSUJ+CQkpmUWpySWZ+nkKIE1ggN/rJjt0QDbEKNrq6dgrlhtFPty99sncORFTBMBaiECprhCprhCprjCprDJVNj4a6EGpHLlA4NS8F5JWSypxUoPa0zJwcK2WTJAvjJCOd5Pyc/CKr8ozMklS4knJDqJpUs0Qj42TsaoyIUGNMWE1uOlSNmXmykXEiihoA+hSU6g=="}),h])}const C=l(r,[["render",A],["__file","installation.html.vue"]]);export{C as default}; diff --git a/docs/.vuepress/dist/assets/selenium.html-1ec9eacd.js b/docs/.vuepress/dist/assets/selenium.html-1ec9eacd.js new file mode 100644 index 0000000..0273467 --- /dev/null +++ b/docs/.vuepress/dist/assets/selenium.html-1ec9eacd.js @@ -0,0 +1,43 @@ +import{_ as e,E as t,Z as i,$ as p,a0 as n,a3 as s,a1 as o,a4 as c}from"./framework-64cb0dab.js";const l={},r=n("h1",{id:"selenium-spider-integration",tabindex:"-1"},[n("a",{class:"header-anchor",href:"#selenium-spider-integration","aria-hidden":"true"},"#"),s(" Selenium Spider Integration")],-1),u=n("h2",{id:"introduction-to-selenium",tabindex:"-1"},[n("a",{class:"header-anchor",href:"#introduction-to-selenium","aria-hidden":"true"},"#"),s(" Introduction to Selenium")],-1),d={href:"https://selenium-python.readthedocs.io/index.html",target:"_blank",rel:"noopener noreferrer"},k=c(`

Integrating Selenium Spider in Crawlab

Below, we will explain how to integrate a Selenium spider into Crawlab and display the scraping results in the Crawlab frontend interface. We will demonstrate the process using the example of scraping the 36kr (36氪) website.

Creating the Spider

In the Crawlab spider list, create a spider named "36kr" with the execution command python main.py.

Editing the Spider File

Create and open the main.py file and enter the following content:

from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from crawlab import save_item
+
+# create web driver with chrome
+chrome_options = Options()
+chrome_options.add_argument('--headless')
+chrome_options.add_argument('--no-sandbox')
+chrome_options.add_argument('--disable-dev-shm-usage')
+browser = webdriver.Chrome(options=chrome_options)
+
+# navigate to the news list page
+browser.get('https://36kr.com/information/web_news/')
+
+# get article items
+items = browser.find_elements(by=By.CSS_SELECTOR, value='.information-flow-list > .information-flow-item')
+
+# iterate through items
+for item in items:
+    # extract fields
+    el_title = item.find_element(by=By.CSS_SELECTOR, value='.article-item-title')
+    title = el_title.text
+    url = el_title.get_attribute('href')
+    topic = item.find_element(by=By.CSS_SELECTOR, value='.kr-flow-bar-motif > a').text
+    description = item.find_element(by=By.CSS_SELECTOR, value='.article-item-description').text
+    try:
+        pic_url = item.find_element(by=By.CSS_SELECTOR, value='.article-item-pic > img').get_attribute('src')
+    except:
+        pic_url = None
+
+    # save to Crawlab
+    save_item({
+        'title': title,
+        'url': url,
+        'topic': topic,
+        'description': description,
+        'pic_url': pic_url,
+    })
+

In this code, we define the chrome_options for the Chrome browser and include the following important parameters:

Note

These parameters are crucial; otherwise, the script may not run correctly in Crawlab!

chrome_options.add_argument('--headless')
+chrome_options.add_argument('--no-sandbox')
+chrome_options.add_argument('--disable-dev-shm-usage')
+

Finally, we use the save_item method from the Crawlab SDK to save the scraping results obtained by the web scraper.

Running the Spider

Run the "36kr" spider in Crawlab to obtain the scraping results.

`,13);function m(h,v){const a=t("ExternalLinkIcon");return i(),p("div",null,[r,u,n("p",null,[n("a",d,[s("Selenium"),o(a)]),s(" is a tool primarily used for web application testing, but it can also be used to write web scrapers. Unlike traditional HTTP request libraries (such as Requests), Selenium allows you to simulate browser behavior and automate the browser to gather data. This is particularly useful for scraping dynamic web pages that require JavaScript rendering.")]),k])}const g=e(l,[["render",m],["__file","selenium.html.vue"]]);export{g as default}; diff --git a/docs/.vuepress/dist/assets/selenium.html-bb5bf820.js b/docs/.vuepress/dist/assets/selenium.html-bb5bf820.js new file mode 100644 index 0000000..af2959a --- /dev/null +++ b/docs/.vuepress/dist/assets/selenium.html-bb5bf820.js @@ -0,0 +1 @@ +const e=JSON.parse('{"key":"v-1e0946ec","path":"/zh/guide/spider/selenium.html","title":"Selenium 爬虫集成","lang":"zh-CN","frontmatter":{"description":"Selenium 简介 Selenium (https://selenium-python.readthedocs.io/index.html) 是一个用于 Web 应用程序测试的工具,但也可以用于编写 Web 爬虫。与传统的 HTTP 请求库(如 Requests)不同,Selenium 允许您模拟浏览器行为,实际上自动化浏览器来获取数据。这对于那些...","head":[["link",{"rel":"alternate","hreflang":"en-us","href":"https://docs.crawlab.cn/en/guide/spider/selenium.html"}],["meta",{"property":"og:url","content":"https://docs.crawlab.cn/zh/guide/spider/selenium.html"}],["meta",{"property":"og:title","content":"Selenium 爬虫集成"}],["meta",{"property":"og:description","content":"Selenium 简介 Selenium (https://selenium-python.readthedocs.io/index.html) 是一个用于 Web 应用程序测试的工具,但也可以用于编写 Web 爬虫。与传统的 HTTP 请求库(如 Requests)不同,Selenium 允许您模拟浏览器行为,实际上自动化浏览器来获取数据。这对于那些..."}],["meta",{"property":"og:type","content":"article"}],["meta",{"property":"og:locale","content":"zh-CN"}],["meta",{"property":"og:locale:alternate","content":"en-US"}],["meta",{"property":"og:updated_time","content":"2023-07-23T06:58:17.000Z"}],["meta",{"property":"article:author","content":"Marvin Zhang"}],["meta",{"property":"article:modified_time","content":"2023-07-23T06:58:17.000Z"}],["script",{"type":"application/ld+json"},"{\\"@context\\":\\"https://schema.org\\",\\"@type\\":\\"Article\\",\\"headline\\":\\"Selenium 爬虫集成\\",\\"image\\":[\\"\\"],\\"dateModified\\":\\"2023-07-23T06:58:17.000Z\\",\\"author\\":[{\\"@type\\":\\"Person\\",\\"name\\":\\"Marvin Zhang\\"}]}"]]},"headers":[{"level":2,"title":"Selenium 简介","slug":"selenium-简介","link":"#selenium-简介","children":[]},{"level":2,"title":"在 Crawlab 中集成 Selenium 爬虫","slug":"在-crawlab-中集成-selenium-爬虫","link":"#在-crawlab-中集成-selenium-爬虫","children":[{"level":3,"title":"创建爬虫","slug":"创建爬虫","link":"#创建爬虫","children":[]},{"level":3,"title":"编辑爬虫文件","slug":"编辑爬虫文件","link":"#编辑爬虫文件","children":[]},{"level":3,"title":"运行爬虫","slug":"运行爬虫","link":"#运行爬虫","children":[]}]}],"git":{"createdTime":1690095497000,"updatedTime":1690095497000,"contributors":[{"name":"Marvin Zhang","email":"tikazyq@163.com","commits":1}]},"readingTime":{"minutes":1.39,"words":417},"filePathRelative":"zh/guide/spider/selenium.md","localizedDate":"2023年7月23日","autoDesc":true}');export{e as data}; diff --git a/docs/.vuepress/dist/assets/selenium.html-cb00ddf0.js b/docs/.vuepress/dist/assets/selenium.html-cb00ddf0.js new file mode 100644 index 0000000..e5cbf92 --- /dev/null +++ b/docs/.vuepress/dist/assets/selenium.html-cb00ddf0.js @@ -0,0 +1,43 @@ +import{_ as e,E as t,Z as p,$ as o,a0 as n,a3 as s,a1 as i,a4 as c}from"./framework-64cb0dab.js";const 
l={},u=n("h1",{id:"selenium-爬虫集成",tabindex:"-1"},[n("a",{class:"header-anchor",href:"#selenium-爬虫集成","aria-hidden":"true"},"#"),s(" Selenium 爬虫集成")],-1),r=n("h2",{id:"selenium-简介",tabindex:"-1"},[n("a",{class:"header-anchor",href:"#selenium-简介","aria-hidden":"true"},"#"),s(" Selenium 简介")],-1),d={href:"https://selenium-python.readthedocs.io/index.html",target:"_blank",rel:"noopener noreferrer"},k=c(`

Integrating Selenium Spider in Crawlab

Below, we will explain how to integrate a Selenium spider into Crawlab and display the scraping results in the Crawlab frontend interface. We will demonstrate the process using the example of scraping the 36kr (36氪) website.

Creating the Spider

In the Crawlab spider list, create a spider named "36kr" with the execution command python main.py.

Editing the Spider File

Create and open main.py and enter the following content.

from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from crawlab import save_item
+
+# create web driver with chrome
+chrome_options = Options()
+chrome_options.add_argument('--headless')
+chrome_options.add_argument('--no-sandbox')
+chrome_options.add_argument('--disable-dev-shm-usage')
+browser = webdriver.Chrome(options=chrome_options)
+
+# navigate to news list page
+browser.get('https://36kr.com/information/web_news/')
+
+# get article items
+items = browser.find_elements(by=By.CSS_SELECTOR, value='.information-flow-list > .information-flow-item')
+
+# iterate items
+for item in items:
+    # fields
+    el_title = item.find_element(by=By.CSS_SELECTOR, value='.article-item-title')
+    title = el_title.text
+    url = el_title.get_attribute('href')
+    topic = item.find_element(by=By.CSS_SELECTOR, value='.kr-flow-bar-motif > a').text
+    description = item.find_element(by=By.CSS_SELECTOR, value='.article-item-description').text
+    try:
+        pic_url = item.find_element(by=By.CSS_SELECTOR, value='.article-item-pic > img').get_attribute('src')
+    except:
+        pic_url = None
+
+    # save to crawlab
+    save_item({
+        'title': title,
+        'url': url,
+        'topic': topic,
+        'description': description,
+        'pic_url': pic_url,
+    })
+

In this code, we define the chrome_options for the Chrome browser, and the following arguments must be included.

Note

This is important; otherwise, the script will not run correctly in Crawlab!

chrome_options.add_argument('--headless')
+chrome_options.add_argument('--no-sandbox')
+chrome_options.add_argument('--disable-dev-shm-usage')
+

Finally, we use the save_item method from the Crawlab SDK to save the results obtained by the spider.

Running the Spider

Run the "36kr" spider in Crawlab to obtain the scraping results.

`,13);function m(v,b){const a=t("ExternalLinkIcon");return p(),o("div",null,[u,r,n("p",null,[n("a",d,[s("Selenium"),i(a)]),s(" 是一个用于 Web 应用程序测试的工具,但也可以用于编写 Web 爬虫。与传统的 HTTP 请求库(如 Requests)不同,Selenium 允许您模拟浏览器行为,实际上自动化浏览器来获取数据。这对于那些需要 JavaScript 渲染的动态网页非常有用。")]),k])}const _=e(l,[["render",m],["__file","selenium.html.vue"]]);export{_ as default}; diff --git a/docs/.vuepress/dist/assets/selenium.html-e5e9efb9.js b/docs/.vuepress/dist/assets/selenium.html-e5e9efb9.js new file mode 100644 index 0000000..e47db54 --- /dev/null +++ b/docs/.vuepress/dist/assets/selenium.html-e5e9efb9.js @@ -0,0 +1 @@ +const e=JSON.parse('{"key":"v-5f61b491","path":"/en/guide/spider/selenium.html","title":"Selenium Spider Integration","lang":"en-US","frontmatter":{"description":"Introduction to Selenium Selenium (https://selenium-python.readthedocs.io/index.html) is a tool primarily used for web application testing, but it can also be used to write web ...","head":[["link",{"rel":"alternate","hreflang":"zh-cn","href":"https://docs.crawlab.cn/zh/guide/spider/selenium.html"}],["meta",{"property":"og:url","content":"https://docs.crawlab.cn/en/guide/spider/selenium.html"}],["meta",{"property":"og:title","content":"Selenium Spider Integration"}],["meta",{"property":"og:description","content":"Introduction to Selenium Selenium (https://selenium-python.readthedocs.io/index.html) is a tool primarily used for web application testing, but it can also be used to write web ..."}],["meta",{"property":"og:type","content":"article"}],["meta",{"property":"og:locale","content":"en-US"}],["meta",{"property":"og:locale:alternate","content":"zh-CN"}],["meta",{"property":"og:updated_time","content":"2023-07-23T06:58:17.000Z"}],["meta",{"property":"article:author","content":"Marvin Zhang"}],["meta",{"property":"article:modified_time","content":"2023-07-23T06:58:17.000Z"}],["script",{"type":"application/ld+json"},"{\\"@context\\":\\"https://schema.org\\",\\"@type\\":\\"Article\\",\\"headline\\":\\"Selenium Spider Integration\\",\\"image\\":[\\"\\"],\\"dateModified\\":\\"2023-07-23T06:58:17.000Z\\",\\"author\\":[{\\"@type\\":\\"Person\\",\\"name\\":\\"Marvin Zhang\\"}]}"]]},"headers":[{"level":2,"title":"Introduction to Selenium","slug":"introduction-to-selenium","link":"#introduction-to-selenium","children":[]},{"level":2,"title":"Integrating Selenium Spider in Crawlab","slug":"integrating-selenium-spider-in-crawlab","link":"#integrating-selenium-spider-in-crawlab","children":[{"level":3,"title":"Creating the Spider","slug":"creating-the-spider","link":"#creating-the-spider","children":[]},{"level":3,"title":"Editing the Spider File","slug":"editing-the-spider-file","link":"#editing-the-spider-file","children":[]},{"level":3,"title":"Running the Spider","slug":"running-the-spider","link":"#running-the-spider","children":[]}]}],"git":{"createdTime":1690095497000,"updatedTime":1690095497000,"contributors":[{"name":"Marvin Zhang","email":"tikazyq@163.com","commits":1}]},"readingTime":{"minutes":1.16,"words":349},"filePathRelative":"en/guide/spider/selenium.md","localizedDate":"July 23, 2023","autoDesc":true}');export{e as data}; diff --git a/docs/.vuepress/dist/en/api/api-reference.html b/docs/.vuepress/dist/en/api/api-reference.html index f005dcf..034a64b 100644 --- a/docs/.vuepress/dist/en/api/api-reference.html +++ b/docs/.vuepress/dist/en/api/api-reference.html @@ -38,7 +38,7 @@ } - +


Last update:
- + diff --git a/docs/.vuepress/dist/en/api/index.html b/docs/.vuepress/dist/en/api/index.html index e02b29f..7e97d39 100644 --- a/docs/.vuepress/dist/en/api/index.html +++ b/docs/.vuepress/dist/en/api/index.html @@ -38,7 +38,7 @@ } - +

Introduction


Introduction

Crawlab users and developers are allowed to integrate their own data into the Crawlab platform. This is achieved by providing open APIs for data integration.

Please refer to below for more information.

- + diff --git a/docs/.vuepress/dist/en/develop/index.html b/docs/.vuepress/dist/en/develop/index.html index 7ae3c81..0c7fd86 100644 --- a/docs/.vuepress/dist/en/develop/index.html +++ b/docs/.vuepress/dist/en/develop/index.html @@ -38,7 +38,7 @@ } - +


- + diff --git a/docs/.vuepress/dist/en/develop/introduction.html b/docs/.vuepress/dist/en/develop/introduction.html index f5978f9..71d2907 100644 --- a/docs/.vuepress/dist/en/develop/introduction.html +++ b/docs/.vuepress/dist/en/develop/introduction.html @@ -38,7 +38,7 @@ } - +

Introduction


Introduction

🚧 Under construction

- + diff --git a/docs/.vuepress/dist/en/develop/plugins/index.html b/docs/.vuepress/dist/en/develop/plugins/index.html index 1e4683e..84641d0 100644 --- a/docs/.vuepress/dist/en/develop/plugins/index.html +++ b/docs/.vuepress/dist/en/develop/plugins/index.html @@ -38,7 +38,7 @@ } - +

Develop Plugins


Develop Plugins

🚧 Under construction

- + diff --git a/docs/.vuepress/dist/en/faq/index.html b/docs/.vuepress/dist/en/faq/index.html new file mode 100644 index 0000000..d8f2bc2 --- /dev/null +++ b/docs/.vuepress/dist/en/faq/index.html @@ -0,0 +1,57 @@ + + + + + + + + FAQ + + + + + + +

FAQ


FAQ

What is Crawlab?

Crawlab is an open-source web crawler management platform. Its design goal is to help users easily create, manage, and monitor web crawler tasks. Crawlab provides a user-friendly graphical interface that allows users to configure crawler tasks, set crawling rules, monitor the crawling status, and view the crawling results.

You can check the Introduction section for more information.

Why can Crawlab execute crawlers written in different programming languages and frameworks?

Crawlab executes crawler tasks based on Shell commands. Therefore, theoretically, any crawler that can be run using Shell commands can be executed in Crawlab if the environment allows.

The Execution Command and Parameters in the crawler are concatenated to form the actual Shell command for the crawler task. For example, if the Execute Command of a certain crawler is python main.py and the parameter is spider1, then the Shell command for the crawler task will be python main.py spider1.
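To illustrate, here is a minimal sketch of the concatenation (build_task_command is a hypothetical helper for illustration, not Crawlab's actual internal API):

```python
import shlex

def build_task_command(execute_command: str, parameters: str) -> str:
    # Crawlab joins the spider's Execute Command and Parameters
    # into the shell command that the task process runs.
    return f"{execute_command} {parameters}".strip()

cmd = build_task_command("python main.py", "spider1")
print(cmd)               # python main.py spider1
print(shlex.split(cmd))  # ['python', 'main.py', 'spider1']
```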

Why does Crawlab always pull version v0.6.0 instead of the latest version?

For users in China, it is highly possible that you have configured the Aliyun mirror proxy. Please use other mirror proxies, such as Tencent Cloud mirror proxy.

Does Crawlab support Scrapy?

Yes, Crawlab supports Scrapy, and it has a built-in pipeline that can be used. You just need to add crawlab.CrawlabPipeline to ITEM_PIPELINES in the settings.py file to integrate it.
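For reference, a minimal sketch of the settings.py change (the priority value 888 is an assumption based on common Crawlab SDK examples; any valid priority works):

```python
# settings.py of your Scrapy project
ITEM_PIPELINES = {
    'crawlab.CrawlabPipeline': 888,  # built-in Crawlab pipeline; priority value is an example
}
```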

For more details, please refer to Spider Integration.

Does Crawlab support Selenium?

Yes, Crawlab supports Selenium for web scraping. For more details, please refer to Selenium Spider Integration.

+ + + diff --git a/docs/.vuepress/dist/en/guide/basic-tutorial/index.html b/docs/.vuepress/dist/en/guide/basic-tutorial/index.html index 2fda268..d9fd0ad 100644 --- a/docs/.vuepress/dist/en/guide/basic-tutorial/index.html +++ b/docs/.vuepress/dist/en/guide/basic-tutorial/index.html @@ -38,7 +38,7 @@ } - +

Quick Tutorial


Quick Tutorial

You have now installed Crawlab and perhaps can't wait to start using it. Before you dive into the details, I would suggest you go through this quick tutorial, which will walk you through the basics and get you familiar with the main features of Crawlab.

Introduction

In this tutorial, we are going to create a spider that crawls quotes on a mock site provided by Zyte (the company behind Scrapy); then we will upload this spider to Crawlab, and run it to extract quotes data; finally, we will view the crawled data visually on Crawlab.

The framework we are going to use is Scrapy, the most popular web crawler framework written in Python, which is easy to use yet with many powerful features.
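As a preview, the finished spider will look roughly like this sketch (the start URL and CSS selectors are assumptions based on Zyte's quotes demo site; the tutorial builds it step by step):

```python
import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['https://quotes.toscrape.com/']  # Zyte's mock quotes site

    def parse(self, response):
        # extract each quote block on the page
        for quote in response.css('.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }
        # follow pagination if present
        next_page = response.css('li.next a::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
```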

Note

We assume you have installed Crawlab on your local machine by following Quick Start. If you haven't, please refer to Quick Start to install it on your local machine.

As we are using Scrapy, please make sure you have installed Python (>=3.6) and the module management tool pip before proceeding any further steps.

Create Spider

First thing first, we are going to generate a Scrapy project. Let's start by installing Scrapy.

pip install scrapy
@@ -90,6 +90,6 @@
   
 
- + diff --git a/docs/.vuepress/dist/en/guide/cli/index.html b/docs/.vuepress/dist/en/guide/cli/index.html index 14af45e..736d960 100644 --- a/docs/.vuepress/dist/en/guide/cli/index.html +++ b/docs/.vuepress/dist/en/guide/cli/index.html @@ -38,7 +38,7 @@ } - +

CLI


CLI

The CLI tools allow users to easily manage Crawlab and perform common actions such as uploading spiders. They are written in Python and are very easy to install.

Install

The Crawlab CLI tools are integrated with the Crawlab SDK. You can install them by executing the command below.

pip install crawlab-sdk
@@ -68,6 +68,6 @@
   
 
- + diff --git a/docs/.vuepress/dist/en/guide/data-sources/index.html b/docs/.vuepress/dist/en/guide/data-sources/index.html index 5a44db3..b63601d 100644 --- a/docs/.vuepress/dist/en/guide/data-sources/index.html +++ b/docs/.vuepress/dist/en/guide/data-sources/index.html @@ -38,7 +38,7 @@ } - +

Data Sources


Data Sources

Crawlab supports data source integration, which means you can use Crawlab to manage your data sources, such as MongoDB, MySQL, PostgreSQL, SQL Server, etc.

Supported Data Sources

Category | Data Source | Supported
Non-Relational | MongoDB | ✅
Non-Relational | ElasticSearch | ✅
Relational | MySQL | ✅
Relational | PostgreSQL | ✅
Relational | SQL Server | ✅
Relational | CockroachDB | ✅
Relational | Sqlite | ✅
Streaming | Kafka | ✅

Add Data Source

  1. Go to the Data Sources page
    data-sources-menu
  2. Click New Data Source button
    new-data-source-button
  3. Select Type as the data source type, and enter Name and connection fields
    mongo-form
  4. Click Confirm button to save the data source

Now you should be able to see the data source in the Data Sources page.

Use Data Source

  1. Go to the Spider Detail page
  2. Select the data source in the Data Source field
    mongo-data-source
  3. Click on Save button to save the spider
  4. Add the related integration code where the spider saves its results data (refer to the Spider Code Examples section below)
  5. Run the spider, and you should see the results in the Data tab
    results

Spider Code Examples

General Python Spider

The method save_item in crawlab-sdk can be used to save data to the designated data source.
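Below is a minimal sketch of a general Python spider using save_item (the target site, selectors, and the requests/beautifulsoup4 dependencies are assumptions for illustration):

```python
import requests
from bs4 import BeautifulSoup
from crawlab import save_item

response = requests.get('https://quotes.toscrape.com/')  # placeholder target site
soup = BeautifulSoup(response.text, 'html.parser')

for quote in soup.select('.quote'):
    # each dict passed to save_item is written to the data source
    # configured for this spider in Crawlab
    save_item({
        'text': quote.select_one('span.text').get_text(),
        'author': quote.select_one('small.author').get_text(),
    })
```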


@@ -62,6 +62,6 @@
   
 
- + diff --git a/docs/.vuepress/dist/en/guide/deps/index.html b/docs/.vuepress/dist/en/guide/deps/index.html index 0fa46e2..7ce1fba 100644 --- a/docs/.vuepress/dist/en/guide/deps/index.html +++ b/docs/.vuepress/dist/en/guide/deps/index.html @@ -38,7 +38,7 @@ } - +

Dependencies Management


Dependencies Management

Crawlab allows users to install and manage dependencies for spiders and tasks.

Expand the Dependencies menu on the left sidebar and click the sub-menu items shown below.

menu.png

  • Settings: Global dependencies settings
  • Python: Python dependencies management
  • Node.js: Node.js dependencies management

Install Dependencies

  1. Navigate to the dependencies management page (Python/Node.js)
    deps-list.png
  2. Click on Installable button
    installable.png
  3. Enter search keywords and click on Search button
    img.png
  4. Click on Install button
    install.png
  5. Select Mode (which nodes to install on) and Upgrade (whether to upgrade), then click the Confirm button
    install-form.png

Uninstall Dependencies

  1. Navigate to the dependencies management page (Python/Node.js)
    deps-list.png
  2. Click on Uninstall button to uninstall the dependency
    uninstall.png
  3. Select Mode (which nodes to uninstall from) and click the Confirm button
    uninstall-form.png

Settings

  1. Navigate to the dependencies management page (Settings)
    settings-list.png
  2. Click on Configure button
    edit.png
  3. Edit the configuration and click on Confirm button
    settings.png

Settings description:

  • Command: executable command for installing/uninstalling dependencies. E.g. pip, /usr/local/bin/pip39, npm, yarn
  • Proxy: proxy for installing/uninstalling dependencies. E.g. https://registry.npm.taobao.org or https://pypi.tuna.tsinghua.edu.cn/simple

Tasks

  1. Navigate to the dependencies management page (Python/Node.js)
  2. Click on Tasks button
    tasks.png
  3. You can view install/uninstall tasks
    tasks-list.png
  4. Click on Logs button to view logs
    tasks-logs.png
  5. You can view logs of given tasks
    tasks-logs-content.png
- + diff --git a/docs/.vuepress/dist/en/guide/environment/index.html b/docs/.vuepress/dist/en/guide/environment/index.html index 2360102..4858ba0 100644 --- a/docs/.vuepress/dist/en/guide/environment/index.html +++ b/docs/.vuepress/dist/en/guide/environment/index.html @@ -38,7 +38,7 @@ } - +

Environment Variables


Environment Variables

NOTE

This feature is only available in the Crawlab Pro Edition.

Crawlab allows users to set environment variables during spider runtime.

Setting Environment Variables

  1. Navigate to the Environment Variables page.
  2. Click the Create Environment Variable button.
  3. Fill in the configuration form.

Accessing Environment Variables

Assuming we have set an environment variable with the key FOO and the value BAR, we can access it in a spider script using the following sample code.

import os
@@ -56,6 +56,6 @@
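A minimal sketch of the complete snippet, assuming FOO is exposed to the task process as an ordinary OS environment variable:

```python
import os

# FOO was set on the Environment Variables page with the value BAR
foo = os.environ.get('FOO')
print(foo)  # BAR
```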
   
 
- + diff --git a/docs/.vuepress/dist/en/guide/index.html b/docs/.vuepress/dist/en/guide/index.html index 7a281c3..5f4df4f 100644 --- a/docs/.vuepress/dist/en/guide/index.html +++ b/docs/.vuepress/dist/en/guide/index.html @@ -38,7 +38,7 @@ } - +

Introduction


Introduction

If you already know what Crawlab is and what it is used for, you can head straight to Quick Start or Installation to install and start to use Crawlab.

If you are not familiar with Crawlab, you can read sections below in order to understand more about Crawlab.

What is Crawlab?

Crawlab is a powerful Web Crawler Management Platform (WCMP) that can run web crawlers and spiders developed in various programming languages including Python, Go, Node.js, Java, C# as well as frameworks including Scrapy, Colly, Selenium, Puppeteer. It is used for running, managing and monitoring web crawlers, particularly in production environment where traceability, scalability and stability are the major factors to concern.

Background and History

Crawlab project has been under continuous development since it was published in March 2019, and gone through a number of major releases. It was initially designed for solving the managerial issue when there are a large number of spiders to coordinate and execute. With a lot of improvements and newly updated features, Crawlab is becoming more and more popular in developer communities, particularly amongst web crawler engineers.

Change Logs

Who can use Crawlab?

  • Web Crawler Engineers. By integrating web crawler programs into Crawlab, you can focus only on the crawling and parsing logic, instead of wasting time writing common modules such as task queues, storage, logging, and notifications.
  • Operation Engineers. The main benefits from Crawlab for Operation Engineers are the convenience in deployment (for both crawler programs and Crawlab itself). Crawlab supports easy installation with Docker and Kubernetes.
  • Data Analysts. Data analysts who can code (e.g. Python) are able to develop web crawler programs (e.g. Scrapy) and upload them into Crawlab. Then leave all the rest dirty work to Crawlab, and it will automatically collect data for you.
  • Others. Technically everyone can enjoy the convenience and easiness of automation provided by Crawlab. Though Crawlab is good at running web crawler tasks, it can also be used for other types of tasks such as data processing and automation.

Main Features

Category | Feature | Description
Node | Node Management | Register, manage and control multiple nodes in the distributed system
Spider | Spider Deployment | Auto-deploy spiders to multiple nodes and auto-sync spider files including scripts and programs
Spider | Spider Code Editing | Update and edit script code with the online editor on the go
Spider | Spider Stats | Spider crawling statistics such as average running time and results count
Spider | Framework Integration | Integrate spider frameworks such as Scrapy
Spider | Data Storage Integration | Automatically save results data in the database without additional configuration
Spider | Git Integration | Version control through embedded or external remote Git repos
Task | Task Scheduling | Assign and schedule crawling tasks to multiple nodes in the distributed system
Task | Task Logging | Automatically save task logs, which can be viewed in the frontend UI
Task | Task Stats | Visually display task stats including task results count and running time
User | User Management | Create, update and delete user accounts
Other | Dependency Management | Search and install Python and Node.js dependency packages
Other | Notification | Automatic email or mobile notifications when tasks are triggered or complete
diff --git a/docs/.vuepress/dist/en/guide/installation/direct-deploy.html b/docs/.vuepress/dist/en/guide/installation/direct-deploy.html


Direct Deploy

🚧 Under construction...

diff --git a/docs/.vuepress/dist/en/guide/installation/docker.html b/docs/.vuepress/dist/en/guide/installation/docker.html


Installation: Docker

Docker is the most convenient and easiest way to install and deploy Crawlab. If you are not familiar with Docker, you can refer to the Docker Official Site and install it on your local machine. Make sure you have installed Docker before proceeding with any further steps.

Main Process

There are several deployment modes for Docker installation, but the main process is similar.

  1. Install Docker and Docker-Compose
  2. Pull Docker image of Crawlab (and MongoDB if you have no external MongoDB instance)
  3. Create docker-compose.yml and make configurations
  4. Start Docker containers

Note

In the following guidance, we assume you have installed Docker and Docker-Compose and have already pulled the Docker images.

Standalone-Node Deployment

Standalone-Node Deployment (SND) is similar to the configuration in Quick Start, and is normally for demo purposes or for managing a small number of crawlers. In SND, all Docker containers, including Crawlab and MongoDB, are on a single machine, i.e. the Master Node (see the diagram above).

Create docker-compose.yml and enter the content below.

version: '3.3'
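# What follows is a minimal sketch of the rest of the file for SND, assuming
# the default crawlabteam/crawlab image, the default web UI port 8080, and a
# bundled MongoDB container; adjust names and versions to your environment.
services:
  master:
    image: crawlabteam/crawlab
    container_name: crawlab_master
    environment:
      CRAWLAB_NODE_MASTER: "Y"     # marks this instance as the Master Node
      CRAWLAB_MONGO_HOST: "mongo"  # points Crawlab at the mongo service below
    ports:
      - "8080:8080"                # web UI / API port
    depends_on:
      - mongo
  mongo:
    image: mongo:4.2
    container_name: crawlab_mongo
    restart: always

Then start the containers with docker-compose up -d.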
diff --git a/docs/.vuepress/dist/en/guide/installation/index.html b/docs/.vuepress/dist/en/guide/installation/index.html


Installation

There are multiple methods of installing Crawlab. You can refer to the summary table below to choose the one that is most suitable.

Install Method | Recommended Environment | Recommended Users
Docker | Demo / Production (nodes <= 10) | 1. Small cluster needed; 2. Familiar with Docker; 3. Minimal maintenance required
Kubernetes (To be updated) | Production (nodes > 10) | 1. Medium or large cluster needed; 2. Scalability is a major concern; 3. Familiar with Kubernetes or orchestration; 4. Professional operation resources available
Direct Deploy (To be updated) | Demo / Experimental | 1. Additional customization needed; 2. Familiar with Vue.js or Go; 3. Willing to work with source code
diff --git a/docs/.vuepress/dist/en/guide/installation/kubernetes.html b/docs/.vuepress/dist/en/guide/installation/kubernetes.html


Kubernetes

🚧 Under construction...

diff --git a/docs/.vuepress/dist/en/guide/monitoring/index.html b/docs/.vuepress/dist/en/guide/monitoring/index.html


Monitoring

NOTE

This functionality is for the Pro Edition only.

Crawlab Pro supports performance monitoring, which means you can use Crawlab Pro to monitor the performance of your nodes.

Performance Metrics Overview

  1. Go to the Metrics page
    metrics-menu
  2. You can see the snapshots of the performance metrics of all nodes
    metrics-overview

Performance Metrics Detail

  1. Go to the Metrics Detail page by clicking the View button on the Metrics page
    view-button
  2. You can see the performance metrics of the selected node
    metrics-detail
  3. You can switch the metrics source by selecting the Metrics Source dropdown
    metrics-source
  4. You can select the time range/unit by selecting the Time Range dropdown
    time-range
    and Time Unit
    time-unit
  5. You can check or uncheck metrics on the left panel to show/hide them on the right panel
    metrics-panel
diff --git a/docs/.vuepress/dist/en/guide/node/index.html b/docs/.vuepress/dist/en/guide/node/index.html


Node

A node is a Crawlab instance that runs crawling tasks or provides other functionalities. You can basically regard a node as a server.

There are two types of nodes, each of which serves different functionalities.

  1. Master Node
  2. Worker Node

Note

Of course you can set up multiple Crawlab instances (nodes) on a server, but that is NOT recommended as a single instance (node) on a server normally suffices.

Master Node

Master Node is the control center of the whole distributed system in Crawlab. It acts like the brain of a human body. Master Node assigns tasks to Worker Nodes or itself, and manages them. It also deploys and distributes spider files to other nodes. Furthermore, it provides APIs to the frontend application and handles communication between each node.

Note

There is only ONE Master Node in Crawlab.

Worker Node

Worker Node is a Crawlab instance dedicated to running crawling tasks. Normally, a single node or server is limited by its computing power and resources, including CPU, memory and network IO. Therefore, the number of Worker Nodes can be increased to scale up the throughput of data collection and improve the overall crawling performance of the distributed system.

Tips

There can be none (SND) or multiple Worker Nodes (MND) in Crawlab.

Topology

Check Node Status

On the Nodes page, you can view the status of a node, i.e. whether it is online or offline.

Enable/Disable

You can enable or disable nodes for running tasks by toggling the switch button of the Enabled attribute on the Nodes page or the node detail page.

Set Max Runners

A node can run multiple tasks at the same time. The number of concurrent tasks is controlled by Max Runners of a node. It can be configured in the node detail page.

Set Basic Info

Basic info such as node name, IP, MAC address can be set in the node detail page.

Add Node

You can refer to Set up Worker Nodes in Multi-Node Deployment (MND) of Docker Installation to add new nodes.

diff --git a/docs/.vuepress/dist/en/guide/notifications/index.html b/docs/.vuepress/dist/en/guide/notifications/index.html


Notifications

NOTE

This functionality is for the Pro Edition only.

Crawlab allows users to receive email or mobile notifications.

Email

  1. Navigate to Notifications page
    notifications-menu.png
  2. Click a notification config of Email type
  3. Fill in the configuration form
    email-config.png
  4. Click on Save button

SMTP configurations:

  • SMTP Server: SMTP server address
  • SMTP Port: SMTP server port
  • SMTP User: SMTP server username
  • SMTP Password: SMTP server password
  • Sender Email: SMTP server sender email
  • Sender Identity: SMTP server sender identity
  • To: Recipient email
  • CC: CC email
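As an illustration, the form for a hypothetical Gmail sender might be filled in as follows (all values are placeholders):

  • SMTP Server: smtp.gmail.com
  • SMTP Port: 465
  • SMTP User: alice@gmail.com
  • SMTP Password: <app password>
  • Sender Email: alice@gmail.com
  • Sender Identity: Crawlab Bot
  • To: team@example.com
  • CC: ops@example.com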

Mobile

  1. Navigate to Notifications page
    notifications-menu.png
  2. Click a notification config of Mobile type
  3. Fill in the configuration form
    mobile-config.png
  4. Click on Save button

Tips

Please refer to related documentation for how to get webhook tokens.

Trigger

  1. Navigate to the Notifications page
    notifications-menu.png
  2. Click on the Trigger tab
  3. Select the event types you want to trigger

Template

  1. Navigate to Notifications page
    notifications-menu.png
  2. Click a notification config of any type
  3. Click on Template tab
    template.png

Tips

To understand the syntax and variables of templates, please refer to template-parser.

diff --git a/docs/.vuepress/dist/en/guide/permissions/index.html b/docs/.vuepress/dist/en/guide/permissions/index.html


Permissions Management

NOTE

This functionality is for the Pro Edition only.

Crawlab Pro supports RBAC-based permissions management, which means you can use Crawlab Pro to manage the permissions of your users via roles.

Permissions

Permissions in Crawlab Pro are the basic unit of user access control.

Types of permissions

The types of permissions are as follows:

  • Action: Specific actions that a role can perform, such as View, Edit, Delete, Create, etc.
  • Page: Specific pages that a role can access, such as Spiders, Tasks, Nodes, etc.
  • Data: Specific data records that a role can access, such as Spiders attributed to a specific user.

Permission fields

The fields of a permission are as follows:

  • Type: Type of the permission: Action, Page, or Data.
  • Target: Regex pattern of the targets that the permission operates on.
  • Allow: Regex pattern of allowed items.
  • Deny: Regex pattern of denied items.
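For example, a hypothetical Page permission that allows a role to access spider pages while denying settings pages might look like this (the values are illustrative, following the field definitions above):

  • Type: Page
  • Target: .*
  • Allow: /spiders.*
  • Deny: /settings.*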

Create a permission

  1. Go to the Permissions page by clicking the Permissions button in the sidebar.
    permissions-menu
  2. Click the New Permission button
    permissions-create
  3. Enter necessary info of the new permission and click Confirm button
    permissions-create-form

Delete a permission

  1. Go to the Permissions page by clicking the Permissions button in the sidebar.
    permissions-menu
  2. Click the Delete button of the permission you want to delete
    delete-button

Roles

Roles in Crawlab Pro can be defined by admin users. Roles are associated with a set of permissions, and can be assigned to users.

Create a Role

  1. Go to the Roles page by clicking the navigation button on the left sidebar
    roles-menu
  2. Click the New Role button
    roles-create
  3. Enter necessary info of the new role and click Confirm button
    roles-create-form

Delete a role

  1. Go to the Roles page by clicking the Roles button in the sidebar.
    roles-menu
  2. Click the Delete button of the role you want to delete.
    delete-button

Link Permissions to a Role

  1. Go to the Permissions tab in the Role Detail page by clicking the View permissions button.
    view-permissions-button
  2. Click on the Link Permissions button.
    link-permissions-button
  3. Select the permissions you want to link/unlink to the role, and click the Confirm button.
    link-permissions-form

Link Users to a Role

  1. Go to the Users tab in the Role Detail page by clicking the View users button.
    view-users-button
  2. Click on the Link Users button.
    link-users-button
  3. Select the users you want to link/unlink to the role, and click the Confirm button.
    link-users-form
diff --git a/docs/.vuepress/dist/en/guide/plugin/index.html b/docs/.vuepress/dist/en/guide/plugin/index.html


Plugin

A plugin is an extension that goes beyond the existing functionalities and features. In Crawlab, the Plugin Framework is in place for users to customize their web crawler management platforms.

Why Plugin

Why don't we just hack on the source code of Crawlab when customization is needed? The reason is maintainability. If you change the code of core modules in Crawlab, you risk your project's maintainability: there will be upgrades in the future, which would very likely break your customization.

A well-designed plugin is less likely to be tightly coupled with Crawlab, so that updates in Crawlab will not significantly affect the plugin. Plugins are pluggable and easy to install or uninstall.

Plugin Framework

The Plugin Framework is embedded in Crawlab and manages both official and third-party plugins. Crawlab users can develop plugins based on the Crawlab Plugin Framework (CPF).

Official Plugins

There are some public official plugins maintained by the Crawlab Team. The GitHub repos of official Crawlab plugins are normally located in the Crawlab Team's repositories, each of which has the prefix plugin-.

Name | Description | Repository Link
plugin-notification | Sending alerts and notifications such as emails and mobile push notifications | Link
plugin-dependency | Installing and managing dependencies and runtime environments | Link
plugin-spider-assistant | Providing advanced web crawler features such as framework support (e.g. Scrapy) | Link

Install Plugin

Tips

After a plugin is installed, you should refresh the page in your web browser for the plugin's UI components to display.

There are several ways of installing plugins in Crawlab.

Install Official Plugins

You can install official plugins by simply entering the plugin name in the Install Plugin dialog.

  1. Navigate to Plugins.
  2. Choose Public.
  3. Click the Install button on the plugins you would like to install.

Install by Git

If you know the Git URL of a Crawlab plugin, you can install it through that URL.

  1. Navigate to Plugins.
  2. Choose Type as Git.
  3. Enter the url of the plugin in the field Install URL.
  4. Click Confirm.
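For example, to install the official notification plugin from GitHub, you could enter a repository URL such as https://github.com/crawlab-team/plugin-notification (official plugin repos follow the plugin- prefix convention mentioned above).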

Install by Local

Note

This method is recommended only when you are developing Crawlab with source code.

  1. Navigate to Plugins.
  2. Choose Type as Local.
  3. Enter local path of the plugin in the field Install Path.
  4. Click Confirm.

Installation Source

Note

Installation Source is only for official plugins.

The default installation source for official plugins is GitHub, but GitHub is not always the fastest source to access. For example, if you are in Mainland China, accessing GitHub can sometimes be slow; in that case, you can choose Gitee as the source for official plugins, which will significantly speed up plugin installation.

Uninstall Plugin

You can uninstall a plugin by clicking the Delete button on the right of the Plugins page.

Start/Stop

You can start or stop a plugin by clicking the Start or Stop button on the right of the Plugins page.

diff --git a/docs/.vuepress/dist/en/guide/plugin/plugin-dependency.html b/docs/.vuepress/dist/en/guide/plugin/plugin-dependency.html


plugin-dependency

plugin-dependency is a plugin that manages dependencies in Crawlab. For example, your Python crawlers may need to use libraries such as selenium or sqlalchemy in addition to the libraries pre-installed in Crawlab. With plugin-dependency, you can easily install and manage your dependencies and libraries in the Crawlab web UI.

Available Dependency Frameworks

  • Python
  • Node.js

Search and Install Dependencies

You can search and install dependencies on Crawlab Web UI with plugin-dependency, just like in popular IDEs such as JetBrains IDEA and VS Code.

  1. Navigate to the dependency framework page, e.g. Python.
  2. Click the Installable button.
  3. Type a keyword into the search input on the top left.
  4. Click the search icon button.
  5. Click the Install button on the right of the dependencies you'd like to install.

Uninstall Dependencies

Uninstalling dependencies is also supported.

  1. Navigate to the dependency framework page, e.g. Python.
  2. Click the Installed button.
  3. Type a keyword into the search input on the top left.
  4. Click the search icon button.
  5. Click the Uninstall button on the right of the dependencies you'd like to uninstall.

View Tasks

You may want to check whether an installation or uninstallation was successful. You can do so by viewing tasks as follows.

  1. Navigate to the dependency framework page, e.g. Python.
  2. Click Tasks button.
  3. You can view logs of each task by clicking Logs button.
diff --git a/docs/.vuepress/dist/en/guide/plugin/plugin-notification.html b/docs/.vuepress/dist/en/guide/plugin/plugin-notification.html


plugin-notification

plugin-notification is a Crawlab plugin that allows users to send and receive notifications from Crawlab via email or mobile applications (e.g. WeChat, DingTalk).

Notification Type

There are 2 types of notifications in plugin-notification.

  • Mail: Sending notifications via email.
  • Mobile: Sending notifications via mobile webhooks.

Triggers

plugin-notification allows users to set triggers in order to configure when to send notifications.

You can follow the below steps to configure triggers.

  1. Navigate to Notifications page.
  2. Navigate to notification detail page by clicking the name or View button on the right.
  3. Click Triggers tab.
  4. Select triggers for sending notifications.

Template

plugin-notification allows users to customize notification content.

You can follow the below steps to customize content.

  1. Navigate to Notifications page.
  2. Navigate to notification detail page by clicking the name or View button on the right.
  3. Click Template tab.
  4. Edit template.
diff --git a/docs/.vuepress/dist/en/guide/plugin/plugin-spider-assistant.html b/docs/.vuepress/dist/en/guide/plugin/plugin-spider-assistant.html


plugin-spider-assistant

plugin-spider-assistant is a Crawlab plugin that provides assistance in spider management. It allows users to view and manage items in spider frameworks.

Spider Frameworks

Name | Language | View | Manage
Scrapy | Python
Colly | Go
WebMagic | Java
DotnetSpider | C#

How to use

  1. Navigate to spider detail page.
  2. Click Assistant tab.
  3. You are now able to view info of detected spider framework.
diff --git a/docs/.vuepress/dist/en/guide/project/index.html b/docs/.vuepress/dist/en/guide/project/index.html


Project

A project is a group of spiders that are normally closely related and mostly crawl sites or data in the same category or industry. Therefore, you can regard projects as a way of grouping spiders together so that they can be better managed.

A project has a one-to-many relationship with spiders.

You can link a spider to a project by either,

  1. selecting Project in the spider detail page, or
  2. selecting Project in the create new spider dialog.

View Spiders

Navigate to Spiders tab in the project detail page.

diff --git a/docs/.vuepress/dist/en/guide/quick-start.html b/docs/.vuepress/dist/en/guide/quick-start.html


Quick Start

The quickest way to install Crawlab is with Docker. If you are not familiar with Docker, you can refer to the Docker Official Site and install it on your local machine.

Pull Images

Make sure you have installed Docker, and then pull the images of Crawlab and MongoDB.

docker pull crawlabteam/crawlab
docker pull mongo
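The remaining steps mirror the SND example in Installation: Docker: create a docker-compose.yml and start the containers. A minimal sketch, assuming the default web UI port 8080:

docker-compose up -d

Once the containers are up, open http://localhost:8080 in your browser and log in with the default admin account (see the User section).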
diff --git a/docs/.vuepress/dist/en/guide/schedule/index.html b/docs/.vuepress/dist/en/guide/schedule/index.html


Schedule

Most of the time, we need to run crawling tasks for a spider periodically. In that case, you need a schedule.

The concept of a schedule in Crawlab is similar to crontab in Linux. It is a long-running job that executes spider tasks periodically.

Tips

If you would like to configure a web crawler that automatically runs crawling tasks every day/week/month, you should probably set up a schedule. Schedule is the right way to automate things, especially for spiders that crawl incremental content.

Create Schedule

  1. Navigate to Schedules page.
  2. Click New Schedule button on the top left.
  3. Enter basic info including Name, Cron Expression and Spider.
  4. Click Confirm.

The created schedule is enabled by default. Once you have created an enabled schedule, it will trigger tasks on time according to the cron expression you set.

Tips

You can debug whether the schedule module works in Crawlab by creating a new schedule with Cron Expression as * * * * *, which means "every minute", so that you can check if a task will be triggered when the next minute starts.

Enable/Disable Schedule

You can enable or disable schedules by toggling the switch button of the Enabled attribute on the Schedules page or the schedule detail page.

Cron Expression

Cron Expression is a simple and standard format to describe the periodicity of tasks. It is the same as the format in Linux crontab.

*    *    *    *    *    Command_to_execute
|    |    |    |    |
|    |    |    |    +----- Day of week (0-6, Sunday = 0)
|    |    |    +---------- Month (1-12)
|    |    +--------------- Day of month (1-31)
|    +-------------------- Hour (0-23)
+------------------------- Minute (0-59)
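For example, the following expressions are all valid schedules:

0 0 * * *      # every day at midnight
0 */6 * * *    # every 6 hours
30 9 * * 1     # every Monday at 09:30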
diff --git a/docs/.vuepress/dist/en/guide/spider/file-editor.html b/docs/.vuepress/dist/en/guide/spider/file-editor.html


File Editor

Crawlab allows users to edit files in the browser. This is useful for editing files such as settings.py and items.py in the spider.

Open File

  1. Navigate to Files tab in spider detail page.
    files-tab
  2. Double-click the file you want to edit.
    files-sidebar
  3. The file should be opened in the editor.
    file-editor

Edit File

  1. Make changes to the file.

Save File

  1. Press Ctrl + S or click Save button in the nav bar to save the file.
    save-btn

Move File

  1. Drag and drop the file to the folder you want to move to.

Rename File

  1. Right-click the file and select Rename.
    rename

Duplicate File

  1. Right-click the file and select Duplicate.
    duplicate

Delete File

  1. Right-click the file and click Delete in the context menu.
    delete-file
diff --git a/docs/.vuepress/dist/en/guide/spider/git.html b/docs/.vuepress/dist/en/guide/spider/git.html


Git Integration

Crawlab allows users to configure a Git repository for version control of their spiders. This is very useful for team collaboration.

Configure Git Repository

  1. Navigate to the Git tab on the spider details page.
  2. Enter the URL of the Git repository, and the system will detect whether it's HTTPS or SSH.
  3. Enter the username and password for the Git repository or SSH key.
  4. Click the Save button.

Switch Branch

  1. Click the Checkout button.
  2. Select the branch you want to check out.
  3. Click the Confirm button.

Pull Code

  1. Click the Pull button and confirm.
  2. The system will pull the code from the remote repository.

Configure Auto Pull

  1. Check the Auto Pull option.
  2. Select the interval for code pulling.
  3. For spiders with the Auto Pull option checked, the system will automatically pull the code every minute.

Commit Changes

  1. Select the Changes tab.
  2. Check the files you want to commit.
  3. Click the Commit button and confirm.
diff --git a/docs/.vuepress/dist/en/guide/spider/index.html b/docs/.vuepress/dist/en/guide/spider/index.html


Spider

Spider is the basic unit of web crawler programs in Crawlab. You can regard it as a web crawling software project consisting of code and files, e.g. a Scrapy project. Please note that the term project mentioned here is not the same as the basic concept Project in Crawlab.

Note

The concept Spider is so important in Crawlab that we strongly recommend you read through this section.

Typical Process

Below is a typical process for users to play with spiders in Crawlab.

Create Spider

  1. Navigate to Spiders page, and click New Spider button on the top left.
  2. Enter relevant info including Name and Execute Command.
  3. Click Confirm.

Execute Command is the base bash/shell command that will be executed when running the spider, e.g. scrapy crawl myspider.

Upload Spider

There are several ways to upload spider files.

Upload Folder

  1. Navigate to spider detail page.
  2. Click Files tab.
  3. Click Upload button in the nav bar.
  4. Choose Folder.
  5. Click Click to Select Folder to Upload.
  6. Choose the folder where spider files are located.
  7. Click Confirm.

Upload Files

  1. Navigate to spider detail page.
  2. Click Files tab.
  3. Click Upload button in the nav bar.
  4. Choose Files.
  5. Drag and drop spider files into the drop zone, or click the drop zone and select files.
  6. Click Confirm.

Upload Files (Drag & Drop)

  1. Navigate to spider detail page.
  2. Click Files tab.
  3. Drag and drop spider files or folders into folders on file navigator on the left.

Run Spider

You can follow the steps below to run a spider.

  1. If in spider detail page, click Run button with play icon in the nav bar.
  2. If in Spiders page, click Run button with play icon on the right.
  3. Select appropriate settings for running spider.
  4. Click Confirm.

Here is the explanation of settings for running a spider.

  • Command: Actual cmd/bash/shell base command that will be executed.
  • Param: Actual parameters/arguments passed to Command.
  • Mode: Task running mode. Defaults to Random Node.
  • Priority: Task priority. Defaults to 5.
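For example, if Command is scrapy crawl myspider and Param is -o items.csv, the process actually executed should be scrapy crawl myspider -o items.csv, since Param is appended to Command.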

Entity Relationships

diff --git a/docs/.vuepress/dist/en/guide/spider/integration.html b/docs/.vuepress/dist/en/guide/spider/integration.html


Data Integration

You can integrate your spiders with Crawlab SDK. This allows you to view scraped results visually on Crawlab.

Crawlab SDK supports integration with various web crawler frameworks including Scrapy, and programming languages including Python, Node.js, Go.

NOTE

By default, Crawlab SDK is installed in the base image of Crawlab. You can also install it manually if you are not using Crawlab Docker image.

Scrapy

  1. Make sure you have created a Scrapy spider on Crawlab.
  2. Add crawlab.CrawlabPipeline to ITEM_PIPELINES in the settings.py file.
    ITEM_PIPELINES = {
        'crawlab.CrawlabPipeline': 888,  # the pipeline priority number can be any valid value
    }
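For a plain (non-Scrapy) Python spider, results can be saved with the SDK's save_item helper, which the Selenium example later in this guide also uses. A minimal sketch with arbitrary field names:

from crawlab import save_item

# each saved dict shows up as one row in the task's Data tab
save_item({'title': 'Example', 'url': 'https://example.com'})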
diff --git a/docs/.vuepress/dist/en/guide/spider/selenium.html b/docs/.vuepress/dist/en/guide/spider/selenium.html (new file)


Selenium Spider Integration

Introduction to Selenium

Selenium is a tool primarily used for web application testing, but it can also be used to write web scrapers. Unlike traditional HTTP request libraries (such as Requests), Selenium allows you to simulate browser behavior and automate the browser to gather data. This is particularly useful for scraping dynamic web pages that require JavaScript rendering.

Integrating Selenium Spider in Crawlab

Below, we will explain how to integrate a Selenium spider into Crawlab and display the scraping results in the Crawlab frontend interface, using the 36kr (36氪) website as an example.

Creating the Spider

In the Crawlab spider list, create a spider named "36kr" with the execution command python main.py.

Editing the Spider File

Create and open the main.py file and enter the following content:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from crawlab import save_item

# create web driver with chrome
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
browser = webdriver.Chrome(options=chrome_options)

# navigate to the news list page
browser.get('https://36kr.com/information/web_news/')

# get article items
items = browser.find_elements(by=By.CSS_SELECTOR, value='.information-flow-list > .information-flow-item')

# iterate through items
for item in items:
    # extract fields
    el_title = item.find_element(by=By.CSS_SELECTOR, value='.article-item-title')
    title = el_title.text
    url = el_title.get_attribute('href')
    topic = item.find_element(by=By.CSS_SELECTOR, value='.kr-flow-bar-motif > a').text
    description = item.find_element(by=By.CSS_SELECTOR, value='.article-item-description').text
    try:
        pic_url = item.find_element(by=By.CSS_SELECTOR, value='.article-item-pic > img').get_attribute('src')
    except Exception:
        # the picture is optional; fall back to None when the element is missing
        pic_url = None

    # save to Crawlab
    save_item({
        'title': title,
        'url': url,
        'topic': topic,
        'description': description,
        'pic_url': pic_url,
    })

In this code, we define the chrome_options for the Chrome browser and include the following important parameters:

Note

These parameters are crucial; otherwise, the script may not run correctly in Crawlab!

chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
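In brief: --headless runs Chrome without a GUI (required on servers with no display), --no-sandbox allows Chrome to run as the root user inside a container, and --disable-dev-shm-usage works around the limited /dev/shm size in Docker containers.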

Finally, we use the save_item method from the Crawlab SDK to save the results scraped by the spider.

Running the Spider

Run the "36kr" spider in Crawlab to obtain the scraping results.

diff --git a/docs/.vuepress/dist/en/guide/task/index.html b/docs/.vuepress/dist/en/guide/task/index.html


Task

A task is a process triggered by a spider which crawls data from websites, performs specific operations, or serves other functionalities. It is the basic unit of the execution process of spiders.

In Crawlab, you can not only run tasks with a single click, but also visually view task info such as stats, realtime logs and crawled data. Furthermore, you can set the Priority of tasks to determine their execution sequence.

Run Task

You can either run a task from spider, or follow the steps below.

  1. Navigate to Tasks page.
  2. Click New Tasks button on the top left.
  3. Select Spider and choose other settings.
  4. Click Confirm.

Restart Task

  1. Navigate to Tasks page.
  2. Click Restart button on the right.

Monitor Task

Crawlab provides task monitoring functionalities to allow you to closely watch the results and performance of your crawling tasks.

View Logs

You can view realtime logs in Crawlab.

  1. Navigate to task detail page.
  2. Click Logs tab.

View Data

You can view crawled data in realtime.

  1. Navigate to task detail page.
  2. Click Data tab.

Cancel Task

Once a task is Pending or Running, you can cancel it by either

  1. clicking on Cancel button on the right in Tasks page, or
  2. clicking on Cancel button on the nav bar in task detail page.
diff --git a/docs/.vuepress/dist/en/guide/token/index.html b/docs/.vuepress/dist/en/guide/token/index.html


diff --git a/docs/.vuepress/dist/en/guide/user/index.html b/docs/.vuepress/dist/en/guide/user/index.html


User

Users in Crawlab are equivalent to user accounts in most admin systems. User accounts can be created, updated and deleted.

Admin User

By default, an admin user will be created when Crawlab is initialized. Below is the default username and password.

  • Username: admin
  • Password: admin

Create User

  1. Navigate to Users page, and click New User button on the top left.
  2. Enter relevant info including Username and Password.
  3. Click Confirm.

Change User Password

  1. Navigate to user detail page.
  2. Click Change Password button.
  3. Enter new password and confirm.
diff --git a/docs/.vuepress/dist/en/index.html b/docs/.vuepress/dist/en/index.html

Hello

Documentation for distributed web crawler management platform Crawlab

diff --git a/docs/.vuepress/dist/en/migration/index.html b/docs/.vuepress/dist/en/migration/index.html


Upgrade & Migration

Since the first release of Crawlab, there have been a number of major releases whose fundamental architectures are quite different. Therefore, if you would like to migrate from an old version to the latest one, you can refer to the migration guides below.

v0.6

Crawlab v0.6 is a major release with enhanced stability, scalability and functionality. If you would like to upgrade to this version, please refer to the v0.6 Migration Guide.

v0.5

Crawlab v0.5 is the first stable version. For details, please refer to the v0.5 Documentation.

Feature Comparisons

Feature | v0.6 | v0.5
Node Management
Spider Deployment
Spider Code Editing
Spider Stats
Framework Integration
Data Storage Integration
Git Integration
Task Scheduling
Task Logging
Task Stats
Scheduled Tasks
User Management
Dependency Management
Notification
API
CLI
SDK
Customized Plugins
Configurable Spiders | ❌ (Temporary)
diff --git a/docs/.vuepress/dist/en/migration/v0.6.html b/docs/.vuepress/dist/en/migration/v0.6.html


Migrate to v0.6.x from Old Versions

From v0.5.x

Suppose you have deployed Crawlab v0.5.x and have been running a bunch of spiders in a production environment. You can follow the migration steps below.

  1. Install the latest crawlab-sdk
  2. Execute the CLI command below to migrate your spider data to v0.6.x
    crawlab migrate \
diff --git a/docs/.vuepress/dist/en/principle/architecture/index.html b/docs/.vuepress/dist/en/principle/architecture/index.html


Architecture

The architecture diagram of Crawlab is as below.

Architecture

As a distributed system, Crawlab consists of several modules (or layers), including the Master Node, Worker Nodes, Database, File System and Frontend. Each module (apart from the Master Node, for now) is scalable, so the whole distributed system can be flexibly scaled to support high-performance requirements.

For example, crawling tasks are assigned to the Worker Node cluster, which can adjust its number of nodes as resource requirements change.

diff --git a/docs/.vuepress/dist/en/principle/core-modules/index.html b/docs/.vuepress/dist/en/principle/core-modules/index.html


Core Modules

🚧 Under construction...

diff --git a/docs/.vuepress/dist/en/principle/core-modules/schedule/index.html b/docs/.vuepress/dist/en/principle/core-modules/schedule/index.html


Schedule

🚧 Under construction...

diff --git a/docs/.vuepress/dist/en/principle/core-modules/spider/index.html b/docs/.vuepress/dist/en/principle/core-modules/spider/index.html


Spider

🚧 Under construction...

diff --git a/docs/.vuepress/dist/en/principle/core-modules/task/index.html b/docs/.vuepress/dist/en/principle/core-modules/task/index.html


Task

🚧 Under construction...

diff --git a/docs/.vuepress/dist/en/principle/database/index.html b/docs/.vuepress/dist/en/principle/database/index.html


Database

Crawlab uses MongoDB, an open-source high-performance NoSQL database, as its operational database.

diff --git a/docs/.vuepress/dist/en/principle/database/mongodb.html b/docs/.vuepress/dist/en/principle/database/mongodb.html


MongoDB

MongoDB is one of the most widely used NoSQL databases. It is schemaless and allows for rapid development, because developers do not have to worry about database schema migration when data fields change.

The biggest reason for using MongoDB in Crawlab is its flexibility. As Crawlab is under fast development, the models and data fields need to be updated frequently. Therefore, MongoDB is the best solution.

Crawled results are also stored in MongoDB in the Community Edition, in the same database as the operational data. If you would like to store results in other types of databases (e.g. MySQL, PostgreSQL, ElasticSearch, etc.), or in other MongoDB databases or instances, you can check out the Professional Edition, which has more powerful features for production use.

diff --git a/docs/.vuepress/dist/en/principle/filesystem/index.html b/docs/.vuepress/dist/en/principle/filesystem/index.html


File System

Crawlab uses SeaweedFS as its distributed file system.

Below is how file synchronization works between the master node and worker nodes.

diff --git a/docs/.vuepress/dist/en/principle/filesystem/seaweedfs.html b/docs/.vuepress/dist/en/principle/filesystem/seaweedfs.html


SeaweedFS

SeaweedFS is an open-source distributed file system which is good at hosting small files. As Crawlab is a distributed crawler management platform, SeaweedFS serves as the medium for synchronizing files such as scripts and program files between nodes and instances.

For more details, please refer to the wiki on GitHub.

diff --git a/docs/.vuepress/dist/en/principle/frontend/index.html b/docs/.vuepress/dist/en/principle/frontend/index.html


Frontend

🚧 Under construction...

diff --git a/docs/.vuepress/dist/en/principle/index.html b/docs/.vuepress/dist/en/principle/index.html


Introduction

As developers, you might be interested in how Crawlab works. This section will introduce the fundamentals and basic technologies behind it.

This section consists of the following topics.

diff --git a/docs/.vuepress/dist/en/principle/node/index.html b/docs/.vuepress/dist/en/principle/node/index.html


Node

A node is an instance that executes and manages crawler programs. It is the basic unit of resources in a distributed system.

There are two types of nodes:

  • Master Node
  • Worker Node

If you wonder how nodes communicate with each other, you can refer to Node Communication.

diff --git a/docs/.vuepress/dist/en/principle/node/master-node.html b/docs/.vuepress/dist/en/principle/node/master-node.html


Master Node

A master node is the central part of the distributed system in Crawlab. It manages the registration of worker nodes, assigns and schedules crawling tasks to worker nodes, and exchanges messages with them.

A master node consists of many submodules and is much more complicated than a worker node. You can refer to the diagram below for the submodules in a master node.

diff --git a/docs/.vuepress/dist/en/principle/node/node-communication.html b/docs/.vuepress/dist/en/principle/node/node-communication.html


Node Communication

The master node communicates with worker nodes through gRPC, a high-performance RPC framework.

The diagram below shows how the node communication works in Crawlab.

diff --git a/docs/.vuepress/dist/en/principle/node/worker-node.html b/docs/.vuepress/dist/en/principle/node/worker-node.html


Worker Node

A worker node is a node whose main responsibility is task execution. It receives crawling tasks from the master node and executes the processes that crawl data from target websites. Therefore, you can think of worker nodes as executors: they do not handle API requests, task scheduling, cron jobs or other functionalities handled by the master node.

You can refer to the diagram below for the submodules of worker nodes.

diff --git a/docs/.vuepress/dist/en/pro/index.html b/docs/.vuepress/dist/en/pro/index.html


Introduction

Crawlab Pro Edition is the enterprise product based on the core functionalities of Crawlab. It has additional powerful modules suitable for production environments, offering greater scalability and robustness.

Demo

You can visit the Demo to see how Crawlab Pro works.

Feature Comparison

Feature | Pro | Community
Spider Management
Scheduled Tasks
Task Management
Data Source Integration
Notification
Performance Monitoring
Permission Management
Environment
Canvas (Low-Code Dev)

How to install

Installation for Crawlab Pro is quite simple and similar to the Community version.

Please refer to Install Crawlab Pro for more details.

License

You will need a license to use Crawlab Pro.

diff --git a/docs/.vuepress/dist/en/pro/installation.html b/docs/.vuepress/dist/en/pro/installation.html


Installation

Crawlab Pro is distributed via Docker. You can install it on any platform that supports Docker.

Prerequisites

Tips

If you don't know how to set up a Docker environment, please refer to the Installation via Docker section.

  • Docker Environment. Please make sure you have installed Docker and Docker Compose on your machine.
  • License. You need to have a valid license to initialize and start Crawlab Pro containers.

Main Process

Tips

If you don't know how to obtain licenses, please refer to Crawlab Pro License for more information.

The main process of installing Crawlab Pro is quite similar to that of Crawlab Community except that it requires a license. You can refer to Crawlab Community Installation via Docker for more information.

  1. Install Docker and Docker-Compose
  2. Pull Docker image of Crawlab Pro (and MongoDB if you have no external MongoDB instance)
  3. Create docker-compose.yml and make configurations (including licenses)
  4. Start Docker containers

Note

In the following guidance, we assume you have installed Docker and Docker-Compose and have already pulled the Docker images.

Standalone-Node Deployment

Standalone-Node Deployment (SND) is similar to the configuration in Quick Start, and is normally for demo purposes or for managing a small number of crawlers. In SND, all Docker containers, including Crawlab and MongoDB, are on a single machine, i.e. the Master Node (see the diagram above).

Create docker-compose.yml and enter the content below.

version: '3.3'
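# A minimal sketch of the rest of the file, mirroring the Community SND
# example above; the Pro image name and license variable below are
# illustrative assumptions, so use the exact values provided with your license.
services:
  master:
    image: crawlabteam/crawlab-pro                # assumed Pro image name
    container_name: crawlab_master
    environment:
      CRAWLAB_NODE_MASTER: "Y"
      CRAWLAB_MONGO_HOST: "mongo"
      CRAWLAB_PRO_LICENSE: "<your-license-key>"   # illustrative variable name
    ports:
      - "8080:8080"
    depends_on:
      - mongo
  mongo:
    image: mongo:4.2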
diff --git a/docs/.vuepress/dist/en/pro/license.html b/docs/.vuepress/dist/en/pro/license.html


License

Crawlab Professional Edition is a commercial product. You need a valid license to use it.

How to Purchase

You can purchase a license through the following methods:

  1. Online purchase: https://www.crawlab.cn/en/prices
  2. Contact us: WeChat customer service account tikazyq1
diff --git a/docs/.vuepress/dist/index.html b/docs/.vuepress/dist/index.html

Hello

Welcome to your VuePress site

diff --git a/docs/.vuepress/dist/sitemap.xml b/docs/.vuepress/dist/sitemap.xml
15:57.000Zdailyhttps://docs.crawlab.cn/zh/guide/installation/2022-05-22T13:29:50.000Zdailyhttps://docs.crawlab.cn/zh/guide/installation/direct-deploy.html2021-11-10T13:45:23.000Zdailyhttps://docs.crawlab.cn/zh/guide/installation/docker.html2023-07-23T07:24:12.000Zdailyhttps://docs.crawlab.cn/zh/guide/installation/kubernetes.html2021-11-10T13:02:22.000Zdailyhttps://docs.crawlab.cn/zh/guide/monitoring/2023-07-08T10:15:57.000Zdailyhttps://docs.crawlab.cn/zh/guide/node/2022-10-24T04:35:58.000Zdailyhttps://docs.crawlab.cn/zh/guide/notifications/2023-07-08T10:15:57.000Zdailyhttps://docs.crawlab.cn/zh/guide/permissions/2023-07-08T10:15:57.000Zdailyhttps://docs.crawlab.cn/zh/guide/plugin/2022-05-21T07:12:15.000Zdailyhttps://docs.crawlab.cn/zh/guide/plugin/plugin-dependency.html2022-05-21T07:12:15.000Zdailyhttps://docs.crawlab.cn/zh/guide/plugin/plugin-notification.html2022-05-21T07:12:15.000Zdailyhttps://docs.crawlab.cn/zh/guide/plugin/plugin-spider-assistant.html2022-05-21T07:12:15.000Zdailyhttps://docs.crawlab.cn/zh/guide/project/2022-05-21T07:28:12.000Zdailyhttps://docs.crawlab.cn/zh/guide/schedule/2022-05-21T07:28:12.000Zdailyhttps://docs.crawlab.cn/zh/guide/spider/2023-07-15T06:20:37.000Zdailyhttps://docs.crawlab.cn/zh/guide/spider/file-editor.html2022-10-23T08:22:30.000Zdailyhttps://docs.crawlab.cn/zh/guide/spider/git.html2023-07-15T06:20:37.000Zdailyhttps://docs.crawlab.cn/zh/guide/spider/integration.html2023-07-15T06:20:37.000Zdailyhttps://docs.crawlab.cn/zh/guide/spider/selenium.html2023-07-23T06:58:17.000Zdailyhttps://docs.crawlab.cn/zh/guide/task/2022-05-21T07:28:12.000Zdailyhttps://docs.crawlab.cn/zh/guide/token/2022-10-08T03:05:23.000Zdailyhttps://docs.crawlab.cn/zh/guide/user/2022-05-21T07:28:12.000Zdailyhttps://docs.crawlab.cn/zh/principle/architecture/2022-10-22T04:42:19.000Zdailyhttps://docs.crawlab.cn/zh/principle/core-modules/2022-04-23T09:39:50.000Zdailyhttps://docs.crawlab.cn/zh/principle/database/2022-05-21T07:12:15.000Zdailyhttps://docs.crawlab.cn/zh/principle/database/mongodb.html2023-06-26T04:37:32.000Zdailyhttps://docs.crawlab.cn/zh/principle/filesystem/2022-10-22T04:42:19.000Zdailyhttps://docs.crawlab.cn/zh/principle/filesystem/seaweedfs.html2022-05-21T07:12:15.000Zdailyhttps://docs.crawlab.cn/zh/principle/frontend/2022-04-23T09:39:50.000Zdailyhttps://docs.crawlab.cn/zh/principle/node/2022-05-21T07:12:15.000Zdailyhttps://docs.crawlab.cn/zh/principle/node/master-node.html2022-10-23T08:22:30.000Zdailyhttps://docs.crawlab.cn/zh/principle/node/node-communication.html2022-10-22T04:42:19.000Zdailyhttps://docs.crawlab.cn/zh/principle/node/worker-node.html2022-10-23T08:22:30.000Zdailyhttps://docs.crawlab.cn/en/principle/core-modules/schedule/2022-04-23T09:39:50.000Zdailyhttps://docs.crawlab.cn/en/principle/core-modules/spider/2022-04-23T09:39:50.000Zdailyhttps://docs.crawlab.cn/en/principle/core-modules/task/2022-04-23T09:39:50.000Zdailyhttps://docs.crawlab.cn/zh/principle/core-modules/schedule/2022-04-23T09:39:50.000Zdailyhttps://docs.crawlab.cn/zh/principle/core-modules/spider/2022-04-23T09:39:50.000Zdailyhttps://docs.crawlab.cn/zh/principle/core-modules/task/2022-04-23T09:39:50.000Zdaily \ No newline at end of file diff --git a/docs/.vuepress/dist/zh/api/api-reference.html b/docs/.vuepress/dist/zh/api/api-reference.html index de4ec19..efc398e 100644 --- a/docs/.vuepress/dist/zh/api/api-reference.html +++ b/docs/.vuepress/dist/zh/api/api-reference.html @@ -38,7 +38,7 @@ } - +


Last edited on:
- + diff --git a/docs/.vuepress/dist/zh/api/index.html b/docs/.vuepress/dist/zh/api/index.html index 12e71b5..1512915 100644 --- a/docs/.vuepress/dist/zh/api/index.html +++ b/docs/.vuepress/dist/zh/api/index.html @@ -38,7 +38,7 @@ } - +

Introduction


Introduction

Crawlab provides an open API through which users and developers can integrate their own data into the Crawlab platform.

Please refer to the content below.

- + diff --git a/docs/.vuepress/dist/zh/develop/index.html b/docs/.vuepress/dist/zh/develop/index.html index bd99819..968a324 100644 --- a/docs/.vuepress/dist/zh/develop/index.html +++ b/docs/.vuepress/dist/zh/develop/index.html @@ -38,7 +38,7 @@ } - +


- + diff --git a/docs/.vuepress/dist/zh/develop/introduction.html b/docs/.vuepress/dist/zh/develop/introduction.html index 03b2b18..5ce5639 100644 --- a/docs/.vuepress/dist/zh/develop/introduction.html +++ b/docs/.vuepress/dist/zh/develop/introduction.html @@ -38,7 +38,7 @@ } - +

Introduction


Introduction

🚧 Under construction

- + diff --git a/docs/.vuepress/dist/zh/develop/plugins/index.html b/docs/.vuepress/dist/zh/develop/plugins/index.html index 8c6b66f..230f320 100644 --- a/docs/.vuepress/dist/zh/develop/plugins/index.html +++ b/docs/.vuepress/dist/zh/develop/plugins/index.html @@ -38,7 +38,7 @@ } - +

Develop Plugins


Develop Plugins

🚧 Under construction

- + diff --git a/docs/.vuepress/dist/zh/faq/index.html b/docs/.vuepress/dist/zh/faq/index.html new file mode 100644 index 0000000..481b1a9 --- /dev/null +++ b/docs/.vuepress/dist/zh/faq/index.html @@ -0,0 +1,57 @@ + + + + + + + + FAQ + + + + + + +

FAQ


FAQ

What is Crawlab?

Crawlab is an open-source web crawler management platform. It is designed to help users create, manage, and monitor web crawler tasks with ease. Crawlab provides a user-friendly graphical interface where you can configure crawler tasks, set scraping rules, monitor scraping status, and view scraped results with simple operations.

You can check out the Introduction section to learn more.

Why can Crawlab run spiders written in different programming languages and frameworks?

Crawlab executes spider tasks through shell commands. Therefore, as long as the environment allows, any spider that can be started from a shell command can in principle run in Crawlab.

A spider's Execute Command and Parameters are concatenated to form the actual shell command of the spider task. For example, if a spider's Execute Command is python main.py and its parameter is spider1, the shell command of the spider task is python main.py spider1.
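Conceptually, the concatenation works like the minimal sketch below (illustrative only, not Crawlab's actual source code):

# Execute Command and Parameters as configured on the spider
cmd = 'python main.py'
params = 'spider1'

# the actual shell command of the spider task
shell_command = f'{cmd} {params}'
print(shell_command)  # python main.py spider1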

Why do I always pull Crawlab v0.6.0 instead of the latest version?

If you are in mainland China, you have most likely configured the Alibaba Cloud registry mirror. Please use another registry mirror instead, such as the Tencent Cloud registry mirror.

Does Crawlab support Scrapy?

Crawlab supports Scrapy and ships with a built-in pipeline: simply add crawlab.CrawlabPipeline to ITEM_PIPELINES in settings.py to integrate.

See Spider Integration for details.

Does Crawlab support Selenium?

Crawlab supports Selenium spiders. See Selenium Spider Integration for details.

+ + + diff --git a/docs/.vuepress/dist/zh/guide/basic-tutorial/index.html b/docs/.vuepress/dist/zh/guide/basic-tutorial/index.html index 10e46ce..4abdb7a 100644 --- a/docs/.vuepress/dist/zh/guide/basic-tutorial/index.html +++ b/docs/.vuepress/dist/zh/guide/basic-tutorial/index.html @@ -38,7 +38,7 @@ } - +

Basic Tutorial


Basic Tutorial

You have installed Crawlab and are perhaps eager to start using it. Before you do, we recommend walking through this basic tutorial. It covers some fundamentals and gets you familiar with the main features of Crawlab.

Introduction

In this tutorial, we will create a web crawler that scrapes famous quotes from a mock website provided by Zyte (the company behind Scrapy). We will then upload the crawler to Crawlab and run it to scrape the list of quotes. Finally, we will view the scraped data visually in Crawlab.

The framework we will use is Scrapy, the most popular crawler framework written in Python. It is very easy to use and comes with many powerful features.

Note

We assume you have installed Crawlab locally by following Quick Start. If not, please refer to Quick Start and install it on your local machine.

Since we are using Scrapy, please make sure Python (>=3.6) and the package manager pip are installed before doing anything else.

Create a Spider

First, we will create a Scrapy project, starting with installing Scrapy.

pip install scrapy
@@ -90,6 +90,6 @@
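Once the project is set up, the spider this tutorial builds looks roughly like the sketch below (names and selectors are illustrative, assuming Zyte's demo site quotes.toscrape.com):

import scrapy


class QuotesSpider(scrapy.Spider):
    # illustrative spider name and start URL
    name = 'quotes'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        # extract each quote block on the page
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }
        # follow pagination, if present
        next_page = response.css('li.next a::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)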
   
 
- + diff --git a/docs/.vuepress/dist/zh/guide/cli/index.html b/docs/.vuepress/dist/zh/guide/cli/index.html index d414ba3..f430892 100644 --- a/docs/.vuepress/dist/zh/guide/cli/index.html +++ b/docs/.vuepress/dist/zh/guide/cli/index.html @@ -38,7 +38,7 @@ } - +

Command-Line Interface (CLI)


Command-Line Interface (CLI)

The command-line interface lets users easily manage Crawlab and perform routine operations such as uploading spiders. It is written in Python and is easy to install.

Installation

The Crawlab CLI is bundled with the Crawlab SDK. You can install it with the following command.

pip install crawlab-sdk
@@ -67,6 +67,6 @@
   
 
- + diff --git a/docs/.vuepress/dist/zh/guide/data-sources/index.html b/docs/.vuepress/dist/zh/guide/data-sources/index.html index b462271..f4ab33d 100644 --- a/docs/.vuepress/dist/zh/guide/data-sources/index.html +++ b/docs/.vuepress/dist/zh/guide/data-sources/index.html @@ -38,7 +38,7 @@ } - +

Data Sources


Data Sources

Crawlab supports data source integration, which means you can use Crawlab to manage your data sources, such as MongoDB, MySQL, PostgreSQL, SQL Server, and more.

Supported Data Sources

Category        Data Source
Non-relational  MongoDB
Non-relational  ElasticSearch
Relational      MySQL
Relational      PostgreSQL
Relational      SQL Server
Relational      CockroachDB
Relational      Sqlite
Streaming       Kafka

Create a Data Source

  1. Navigate to the Data Sources page
    data-sources-menu
  2. Click the New Data Source button
    new-data-source-button
  3. Select the data source Type, then enter the Name and connection information
    mongo-form
  4. Click the Save button

Use a Data Source

  1. Navigate to the Spider Detail page
  2. Select the corresponding data source in Data Source
    mongo-data-source
  3. Click the Save button
  4. Where you save result data, add the corresponding integration code (see the Spider Code Examples below)
  5. Run the spider, and you will see the result data in the Data tab
    results

Spider Code Examples

General Python Spider

The save_item method from crawlab-sdk can be called to save data to the corresponding data source.
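A minimal sketch (the save_item import appears elsewhere in these docs; the item fields here are illustrative):

from crawlab import save_item

# save one result item; it will appear in the spider's Data tab
save_item({
    'title': 'Example title',
    'url': 'https://example.com',
})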


@@ -62,6 +62,6 @@
   
 
- + diff --git a/docs/.vuepress/dist/zh/guide/deps/index.html b/docs/.vuepress/dist/zh/guide/deps/index.html index 4b0e319..9b6d51b 100644 --- a/docs/.vuepress/dist/zh/guide/deps/index.html +++ b/docs/.vuepress/dist/zh/guide/deps/index.html @@ -38,7 +38,7 @@ } - +

Dependency Management


Dependency Management

Crawlab allows users to install and manage dependencies for spiders and tasks.

Page Menu

Expand the Environments menu in the left sidebar, then click the following sub-menu items.

menu.png

  • Settings: global dependency settings
  • Python: Python dependency management
  • Node.js: Node.js dependency management

Install Dependencies

  1. Navigate to the dependency management page (Python/Node.js)
    deps-list.png
  2. Click the Installable button
    installable.png
  3. Enter a search keyword and click the Search button
    img.png
  4. Click the Install button
    install.png
  5. Select Mode (which nodes to install on) and Upgrade (whether to upgrade), then click the Confirm button
    install-form.png

Uninstall Dependencies

  1. Navigate to the dependency management page (Python/Node.js)
    deps-list.png
  2. Click the Uninstall button to uninstall a dependency
    uninstall.png
  3. Select Mode (which nodes to uninstall from) and click the Confirm button
    uninstall-form.png

Settings

  1. Navigate to the settings page
    settings-list.png
  2. Click the Configure button
    edit.png
  3. Edit the configuration and click the Confirm button
    settings.png

Description of the settings:

  • Command: the command used to install or uninstall dependencies, e.g. pip, /usr/local/bin/pip39, npm, yarn
  • Proxy: the proxy address used when installing or uninstalling dependencies, e.g. https://registry.npm.taobao.org, https://pypi.tuna.tsinghua.edu.cn/simple

Tasks

  1. Navigate to the dependency management page (Python/Node.js)
    deps-list.png
  2. Click the Tasks button
    task.png
  3. You can check the execution status of tasks in the task list
    tasks-list.png
  4. Click the Logs button to view a task's execution logs
    tasks-logs.png
  5. You can inspect the execution logs in the log view
    tasks-logs-content.png
- + diff --git a/docs/.vuepress/dist/zh/guide/environment/index.html b/docs/.vuepress/dist/zh/guide/environment/index.html index 27a2b32..82f6d0c 100644 --- a/docs/.vuepress/dist/zh/guide/environment/index.html +++ b/docs/.vuepress/dist/zh/guide/environment/index.html @@ -38,7 +38,7 @@ } - +

Environment Variables


Environment Variables

Note

This feature is only available in Crawlab Pro.

Crawlab allows users to set environment variables that are available to spiders at runtime.

Set Environment Variables

  1. Navigate to the Environment Variables page
  2. Click the New Environment Variable button
  3. Fill in the configuration form

Use Environment Variables

Suppose we have set an environment variable with key FOO and value BAR. We can then read it in the spider script, as in the example code below.

import os
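# a minimal sketch, assuming the env var FOO=BAR configured above
foo = os.environ.get('FOO')
print(foo)  # expected output: BAR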
@@ -56,6 +56,6 @@
   
 
- + diff --git a/docs/.vuepress/dist/zh/guide/index.html b/docs/.vuepress/dist/zh/guide/index.html index aeb9fb9..e835a38 100644 --- a/docs/.vuepress/dist/zh/guide/index.html +++ b/docs/.vuepress/dist/zh/guide/index.html @@ -38,7 +38,7 @@ } - +

Introduction


Introduction

If you already know what Crawlab is and what it does, you can go straight to Quick Start or Installation to install and use Crawlab.

If you are not familiar with Crawlab, please read the sections below to learn more about it.

What is Crawlab?

Crawlab is a powerful Web Crawler Management Platform (WCMP) that can run web crawlers developed in multiple programming languages (including Python, Go, Node.js, Java, C#) and frameworks (including Scrapy, Colly, Selenium, Puppeteer). It is used to run, manage, and monitor web crawlers, especially in production environments with high requirements for traceability, scalability, and stability.

Background and History

The Crawlab project has been iterating continuously since March 2019 and has gone through several major updates. It was originally built to solve the management problems that arise when scheduling and running a large number of crawlers. With many optimizations and new features added, Crawlab has become increasingly popular in the developer community, especially among web crawler engineers.

Changelog

Who is Crawlab for?

  • Web crawler engineers. By integrating your crawlers into Crawlab, you can focus on the core parsing logic and avoid wasting time on common modules such as task queues, storage, logging, and notifications.
  • DevOps engineers. The biggest benefit Crawlab brings to DevOps engineers is ease of deployment (for both crawlers and Crawlab itself). Crawlab supports one-click installation with Docker or Kubernetes.
  • Data analysts. Data analysts who can write code (e.g. Python) can develop crawlers (e.g. with Scrapy), upload them to Crawlab, and leave the heavy lifting to Crawlab, which scrapes the data automatically.
  • Others. Strictly speaking, anyone can enjoy the convenience of Crawlab's automation. Although Crawlab excels at running web crawler tasks, it is not limited to them: it can also run other types of tasks, such as data processing and automation.

Main Features

Category  Feature                   Description
Node      Node Management           Register, manage, and monitor multiple nodes in a distributed system
Spider    Spider Deployment         Automatically deploy spiders to multiple nodes and sync spider files such as scripts and programs
Spider    Code Editing              Update and edit script code online in real time
Spider    Spider Statistics         Statistics on spider runs, such as average run duration and result counts
Spider    Framework Integration     Integration with common crawler frameworks, such as Scrapy
Spider    Data Storage Integration  Automatically save result data to the database without extra configuration
Spider    Git Integration           Version control through built-in or external remote Git repositories
Task      Task Scheduling           Dispatch and schedule spider tasks to multiple nodes in a distributed system
Task      Task Logs                 Automatically save task logs, viewable in the frontend UI
Task      Task Statistics           Visualize task data, such as result counts and run durations
User      User Management           Create, update, and delete user accounts
Others    Dependency Management     Search and install dependencies, e.g. Python and Node.js packages
Others    Notifications             Automatically send email or mobile notifications when tasks are triggered or finished
- + diff --git a/docs/.vuepress/dist/zh/guide/installation/direct-deploy.html b/docs/.vuepress/dist/zh/guide/installation/direct-deploy.html index 18f57bc..a247397 100644 --- a/docs/.vuepress/dist/zh/guide/installation/direct-deploy.html +++ b/docs/.vuepress/dist/zh/guide/installation/direct-deploy.html @@ -38,7 +38,7 @@ } - +

Direct Deploy


Direct Deploy

🚧 Under construction...

- + diff --git a/docs/.vuepress/dist/zh/guide/installation/docker.html b/docs/.vuepress/dist/zh/guide/installation/docker.html index dd2d5cc..8214247 100644 --- a/docs/.vuepress/dist/zh/guide/installation/docker.html +++ b/docs/.vuepress/dist/zh/guide/installation/docker.html @@ -5,7 +5,7 @@ - - +

Installation: Docker


Installation: Docker

Docker is the most convenient way to install and deploy Crawlab. If you are not familiar with Docker, you can refer to the Docker official site and install it on your local machine. Please make sure Docker is installed before doing anything else.

Main Process

There are several Docker deployment modes, but the main process is similar.

  1. Install Docker and Docker-Compose
  2. Pull the Crawlab Docker image (and the MongoDB image if you don't have MongoDB)
  3. Create docker-compose.yml and configure it
  4. Start the Docker containers

Note

In the following guide, we assume you have installed Docker and Docker-Compose and have pulled the corresponding Docker images.

Standalone-Node Deployment

Standalone-Node Deployment (SND) is similar to the configuration in Quick Start and is normally used for demos or managing a small number of crawlers. In SND, all Docker containers, including Crawlab and MongoDB, run on a single machine, i.e. the Master Node (see the diagram above).

Create docker-compose.yml and enter the following content.

version: '3.3'
@@ -159,6 +159,6 @@
   
 
- + diff --git a/docs/.vuepress/dist/zh/guide/installation/index.html b/docs/.vuepress/dist/zh/guide/installation/index.html index bcd772e..e27011c 100644 --- a/docs/.vuepress/dist/zh/guide/installation/index.html +++ b/docs/.vuepress/dist/zh/guide/installation/index.html @@ -38,7 +38,7 @@ } - +

Installation


Installation

There are several ways to install Crawlab. You can refer to the summary table below and choose the one that suits you best.

Installation Method   Recommended Environment             Recommended Users
Docker                Demo / production (up to 10 nodes)
  1. Small cluster requirements
  2. Familiar with Docker
  3. Minimal maintenance required
Kubernetes (TBD)      Production (more than 10 nodes)
  1. Medium-to-large cluster requirements
  2. Scalability is a key factor
  3. Familiar with Kubernetes or orchestration technologies
  4. Professional DevOps resources available
Direct Deploy (TBD)   Demo / experiment
  1. Custom development requirements
  2. Familiar with Vue.js and Go
  3. Willing to work with the source code
- + diff --git a/docs/.vuepress/dist/zh/guide/installation/kubernetes.html b/docs/.vuepress/dist/zh/guide/installation/kubernetes.html index 9cfa8c8..ed55804 100644 --- a/docs/.vuepress/dist/zh/guide/installation/kubernetes.html +++ b/docs/.vuepress/dist/zh/guide/installation/kubernetes.html @@ -38,7 +38,7 @@ } - +

Kubernetes


Kubernetes

🚧 Under construction...

- + diff --git a/docs/.vuepress/dist/zh/guide/monitoring/index.html b/docs/.vuepress/dist/zh/guide/monitoring/index.html index a2d9818..4c38c44 100644 --- a/docs/.vuepress/dist/zh/guide/monitoring/index.html +++ b/docs/.vuepress/dist/zh/guide/monitoring/index.html @@ -38,7 +38,7 @@ } - +

Monitoring Metrics


Monitoring Metrics

Note

This feature is only available in Crawlab Pro.

Crawlab Pro supports performance monitoring, which means you can use it to monitor the performance of your nodes.

Metrics Overview

  1. Go to the Metrics page
    metrics-menu
  2. You can see a snapshot of the performance metrics of all nodes
    metrics-overview

Metrics Detail

  1. Go to the Metrics Detail page by clicking the View button on the Metrics page
    view-button
  2. You can see the performance metrics of the selected node
    metrics-detail
  3. You can switch the metrics source via the Metrics Source dropdown
    metrics-source
  4. You can choose the time range via the Time Range dropdown
    time-range
    Time unit
    time-unit
  5. You can show/hide metrics in the right panel by checking metrics in the left panel
    metrics-panel
- + diff --git a/docs/.vuepress/dist/zh/guide/node/index.html b/docs/.vuepress/dist/zh/guide/node/index.html index 993ac35..d610df6 100644 --- a/docs/.vuepress/dist/zh/guide/node/index.html +++ b/docs/.vuepress/dist/zh/guide/node/index.html @@ -38,7 +38,7 @@ } - +

Nodes


Nodes

A node is essentially a Crawlab instance that runs Tasks and provides other functionality. You can basically regard a node as a server.

There are two types of nodes, each providing different functionality.

  1. Master Node
  2. Worker Node

Note

You can of course set up multiple Crawlab instances (nodes) on one server, but doing so is not recommended: a single instance (node) per server is normally sufficient.

Master Node

The Master Node is the central control module of the distributed system in Crawlab, like the brain in a human body. The Master Node assigns Tasks to Worker Nodes or to itself, and manages them. It also deploys and distributes Spider files to other nodes. In addition, it provides APIs for the frontend application and handles communication between nodes.

Note

There is only one Master Node in Crawlab.

Worker Node

A Worker Node is a Crawlab instance dedicated to running scraping Tasks. Normally, a single node or server is limited by its computing power and resources (including CPU, memory, and network IO). Therefore, we can scale the data-collection throughput by adjusting the number of Worker Nodes, improving the overall scraping capability of the distributed system.

Tip

Crawlab can run without Worker Nodes (Standalone-Node Deployment, SND) or with multiple Worker Nodes (Multi-Node Deployment, MND).

Network Topology

View Node Status

On the Nodes page, you can check node status and see whether nodes are online.

Enable/Disable

You can enable or disable a node to control whether it can run spider tasks, by clicking the toggle in the Enabled attribute on the Nodes page or the node detail page.

Set Max Runners

A node can run multiple tasks at the same time. The number of concurrent tasks is controlled by the node's Max Runners, which can be configured on the node detail page.

Set Basic Info

Basic information such as the node name, IP address, and MAC address can be set on the node detail page.

Add Nodes

To add new nodes, refer to Set up Worker Nodes in the Multi-Node Deployment section of the Docker installation guide.

- + diff --git a/docs/.vuepress/dist/zh/guide/notifications/index.html b/docs/.vuepress/dist/zh/guide/notifications/index.html index a84ce15..5e76e39 100644 --- a/docs/.vuepress/dist/zh/guide/notifications/index.html +++ b/docs/.vuepress/dist/zh/guide/notifications/index.html @@ -38,7 +38,7 @@ } - +

Notifications


Notifications

Note

This feature is only available in Crawlab Pro.

Crawlab allows users to receive email or mobile notifications.

Email

  1. Navigate to the Notifications page
    notifications-menu.png
  2. Click the notification configuration of type Email
  3. Fill in the configuration form
    email-config.png
  4. Click the Save button

SMTP configuration:

  • SMTP Server: SMTP server address
  • SMTP Port: SMTP server port
  • SMTP User: SMTP server username
  • SMTP Password: SMTP server password
  • Sender Email: sender email address
  • Sender Identity: sender identity
  • To: recipient email address
  • CC: carbon-copy email address

Mobile

  1. Navigate to the Notifications page
    notifications-menu.png
  2. Click the notification configuration of type Mobile
  3. Fill in the configuration form
    mobile-config.png
  4. Click the Save button

Tip

Please refer to the relevant documentation on how to obtain a webhook token.

Triggers

  1. Navigate to the Notifications page
    notifications-menu.png
  2. Click the Triggers tab
  3. Select the event types that should trigger notifications

Templates

  1. Navigate to the Notifications page
    notifications-menu.png
  2. Click any notification configuration
  3. Click the Template tab
    template.png

Tip

For template syntax and variables, please refer to template-parser.

- + diff --git a/docs/.vuepress/dist/zh/guide/permissions/index.html b/docs/.vuepress/dist/zh/guide/permissions/index.html index 417dcaa..8dd0655 100644 --- a/docs/.vuepress/dist/zh/guide/permissions/index.html +++ b/docs/.vuepress/dist/zh/guide/permissions/index.html @@ -38,7 +38,7 @@ } - +

Permission Management


Permission Management

Note

This feature is only available in Crawlab Pro.

Crawlab Pro supports RBAC-based permission management, which means you can manage your Users' Permissions through Roles.

Permissions

In Crawlab Pro, a permission is the basic unit of user access control.

Permission Types

Permissions in Crawlab Pro fall into the following types:

  • Action permissions: actions a user may perform, such as View, Edit, Delete
  • Page permissions: pages a user may visit, such as Spider Management, Task Management
  • Data permissions: data a user may access, such as a user's own Spiders and Tasks

Permission Fields

A permission in Crawlab Pro contains the following fields (see the sketch after this list):

  • Type: the permission type, e.g. action permission, page permission, data permission
  • Target: a regular expression matching what the permission applies to, e.g. ^/spider$, ^/task$
  • Allow: a regular expression for the allowed scope
  • Deny: a regular expression for the denied scope
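For intuition, here is a minimal sketch of how such regex fields can match a route (illustrative only, not Crawlab's actual implementation):

import re

# hypothetical permission: applies to the spider page route
target = r'^/spider$'

for route in ['/spider', '/task']:
    matched = re.match(target, route) is not None
    print(route, matched)  # /spider True, /task False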

Create a Permission

  1. Click Permissions in the sidebar to open the permission management page
    permissions-menu
  2. Click the New Permission button to open the creation page
    permissions-create
  3. Fill in the permission info and click the Confirm button to create the permission
    permissions-create-form

Delete a Permission

  1. Click Permissions in the sidebar to open the permission management page
    permissions-menu
  2. Click the Delete button to delete the permission
    permissions-delete

Roles

Roles in Crawlab Pro can be defined by admin users. A role is a collection of permissions, and users manage permissions through roles.

Create a Role

  1. Click Roles in the sidebar to open the role management page
    roles-menu
  2. Click the New Role button to open the creation page
    roles-create
  3. Fill in the role info and click the Confirm button to create the role
    roles-create-form

Delete a Role

  1. Click Roles in the sidebar to open the role management page
    roles-menu
  2. Click the Delete button to delete the role
    roles-delete

Link Permissions to Roles

  1. Click Roles in the sidebar to open the role management page
    roles-menu
  2. Click the Link Permissions button
    view-permissions-button
  3. In the dialog, check the permissions to link and click the Confirm button to link or unlink them
    roles-permissions

Link Users to Roles

  1. Click Roles in the sidebar to open the role management page
    roles-menu
  2. Click the Link Users button
    view-users-button
  3. In the dialog, check the users to link and click the Confirm button to link or unlink them
    roles-users
- + diff --git a/docs/.vuepress/dist/zh/guide/plugin/index.html b/docs/.vuepress/dist/zh/guide/plugin/index.html index 65ff90e..35798a2 100644 --- a/docs/.vuepress/dist/zh/guide/plugin/index.html +++ b/docs/.vuepress/dist/zh/guide/plugin/index.html @@ -38,7 +38,7 @@ } - +

Plugins


Plugins

A plugin is an extension that augments existing functionality. In Crawlab, users can customize the crawler management platform through the Plugin Framework.

Why Plugins

Why not just tinker with the Crawlab source code to meet customization needs? The reason is maintainability. If you modify Crawlab's core modules, you take on maintenance risk: your custom features may well break with a future version upgrade.

A well-designed plugin is rarely tightly coupled to Crawlab, so Crawlab version upgrades are unlikely to seriously affect it. Plugins are pluggable and can be easily installed or uninstalled.

Plugin Framework

The Plugin Framework is built into Crawlab and manages official and third-party plugins. Crawlab users can develop plugins based on the Crawlab Plugin Framework (CPF).

Official Plugins

The Crawlab development team has developed a number of public official plugins and provides long-term maintenance for them. Official Crawlab plugins live in the Crawlab development team's GitHub repositories, each prefixed with plugin-.

Name                     Description                                                                 Repo
public-notification      Send notifications, e.g. email and mobile push                              link
public-dependency        Install dependencies and manage dependencies and runtime environments      link
public-spider-assistant  Advanced web crawler features, including framework support such as Scrapy  link

Install Plugins

Tip

After a plugin is installed, you need to refresh the page in the browser for its UI components to appear.

There are several ways to install plugins in Crawlab.

Install by Name

You can install official plugins by entering the plugin name in the Install Plugin dialog.

  1. Navigate to the Plugins page
  2. Select Public
  3. Click the Install button on the plugin you want to install

Install by Git

If you know the Git URL of a Crawlab plugin, you can install it that way.

  1. Navigate to the Plugins page
  2. Select Git
  3. Enter the plugin URL in the Install URL field
  4. Click Confirm

Install Locally

Note

This method is only suitable when developing Crawlab from source.

  1. Navigate to the Plugins page
  2. Select Local
  3. Enter the plugin path in the Install Path field
  4. Click Confirm

Installation Source

Note

Installation Source only applies to official plugins.

The default installation source for official plugins is GitHub, but GitHub is not always the best choice. For example, if you are in mainland China, connections to GitHub can be slow. In that case, you can set Installation Source to Gitee to greatly speed up official plugin installation.

Uninstall Plugins

You can uninstall a plugin by clicking the Delete button on the right side of the Plugins page.

Start/Stop

You can start or stop a plugin by clicking the Start or Stop button on the right side of the Plugins page.

- + diff --git a/docs/.vuepress/dist/zh/guide/plugin/plugin-dependency.html b/docs/.vuepress/dist/zh/guide/plugin/plugin-dependency.html index d066e39..3078ca9 100644 --- a/docs/.vuepress/dist/zh/guide/plugin/plugin-dependency.html +++ b/docs/.vuepress/dist/zh/guide/plugin/plugin-dependency.html @@ -38,7 +38,7 @@ } - +

Dependency Management Plugin (plugin-dependency)


Dependency Management Plugin (plugin-dependency)

plugin-dependency is the Crawlab plugin for managing dependencies. For example, your Python spiders may need packages beyond those pre-installed in Crawlab, such as selenium or sqlalchemy. With plugin-dependency, you can easily install and manage dependencies and libraries in the Crawlab UI.

Available Dependency Frameworks

  • Python
  • Node.js

Search and Install Dependencies

With plugin-dependency, you can search for and install dependencies in the Crawlab UI, just as in popular IDEs such as JetBrains IDEA and VS Code.

  1. Navigate to a dependency framework page, e.g. Python
  2. Click the Installable button
  3. Enter a search keyword in the upper-left search box
  4. Click the search icon button
  5. Click the Install button on the right for the dependency you want to install

Uninstall Dependencies

We can also uninstall dependencies.

  1. Navigate to a dependency framework page, e.g. Python
  2. Click the Installed button
  3. Enter a search keyword in the upper-left search box
  4. Click the search icon button
  5. Click the Uninstall button on the right for the dependency you want to uninstall

View Tasks

You may want to check whether your install or uninstall succeeded. You can do so as follows.

  1. Navigate to a dependency framework page, e.g. Python
  2. Click the Tasks button
  3. You can view each task's logs by clicking the Logs button
- + diff --git a/docs/.vuepress/dist/zh/guide/plugin/plugin-notification.html b/docs/.vuepress/dist/zh/guide/plugin/plugin-notification.html index 5665247..a15da27 100644 --- a/docs/.vuepress/dist/zh/guide/plugin/plugin-notification.html +++ b/docs/.vuepress/dist/zh/guide/plugin/plugin-notification.html @@ -38,7 +38,7 @@ } - +

Notification Plugin (plugin-notification)


Notification Plugin (plugin-notification)

plugin-notification is a Crawlab plugin that allows users to receive Crawlab notifications via email or mobile apps (e.g. WeChat, DingTalk).

Notification Types

plugin-notification provides 2 notification types:

  • Email: send notifications via email
  • Mobile: send notifications via mobile webhooks

Triggers

plugin-notification allows users to configure triggers to control when notifications are sent.

You can configure triggers as follows:

  1. Navigate to the Notifications page
  2. Click the name or the View button on the left to open the notification detail page
  3. Click the Triggers tab
  4. Select the triggers for sending notifications

Templates

plugin-notification allows users to customize notification content.

You can customize notification content as follows:

  1. Navigate to the Notifications page
  2. Click the name or the View button on the left to open the notification detail page
  3. Click the Template tab
  4. Edit the template
- + diff --git a/docs/.vuepress/dist/zh/guide/plugin/plugin-spider-assistant.html b/docs/.vuepress/dist/zh/guide/plugin/plugin-spider-assistant.html index 2c17f48..1e8aa64 100644 --- a/docs/.vuepress/dist/zh/guide/plugin/plugin-spider-assistant.html +++ b/docs/.vuepress/dist/zh/guide/plugin/plugin-spider-assistant.html @@ -38,7 +38,7 @@ } - +

Spider Assistant Plugin (plugin-spider-assistant)


Spider Assistant Plugin (plugin-spider-assistant)

plugin-spider-assistant is a Crawlab plugin that provides assistance with spider management. It allows users to view and manage spiders built with various crawler frameworks.

Crawler Frameworks

Name          Language
Scrapy        Python
Colly         Go
WebMagic      Java
DotnetSpider  C#

How to Use

  1. Navigate to the spider detail page
  2. Click the Assistant button
  3. You should now see information about the detected crawler framework
- + diff --git a/docs/.vuepress/dist/zh/guide/project/index.html b/docs/.vuepress/dist/zh/guide/project/index.html index eef0c6d..5d98394 100644 --- a/docs/.vuepress/dist/zh/guide/project/index.html +++ b/docs/.vuepress/dist/zh/guide/project/index.html @@ -38,7 +38,7 @@ } - +

Projects


Projects

A project is normally a group of closely related Spiders, or spiders that scrape the same type of website data. You can therefore treat projects as a way of grouping spiders so that they can be managed more effectively.

A project has a one-to-many relationship with Spiders.

Link Spiders

You can link a spider to a project by either:

  1. Selecting a project in the Project field on the spider detail page, or
  2. Selecting a project in the Project field of the create-spider dialog

View Spiders

Navigate to the Spiders tab on the project detail page.

- + diff --git a/docs/.vuepress/dist/zh/guide/quick-start.html b/docs/.vuepress/dist/zh/guide/quick-start.html index f8c072a..e9a0aff 100644 --- a/docs/.vuepress/dist/zh/guide/quick-start.html +++ b/docs/.vuepress/dist/zh/guide/quick-start.html @@ -38,7 +38,7 @@ } - +

Quick Start


Quick Start

The quickest way to install Crawlab is Docker. If you are not familiar with Docker, you can refer to the Docker official site and install it on your local machine.

Pull Images

Make sure you have installed Docker and can pull the Crawlab and MongoDB images.

docker pull crawlabteam/crawlab
@@ -70,6 +70,6 @@
   
 
- + diff --git a/docs/.vuepress/dist/zh/guide/schedule/index.html b/docs/.vuepress/dist/zh/guide/schedule/index.html index 1062ed6..e23ab9b 100644 --- a/docs/.vuepress/dist/zh/guide/schedule/index.html +++ b/docs/.vuepress/dist/zh/guide/schedule/index.html @@ -38,7 +38,7 @@ } - +

Schedules


Schedules

Very often, we need to run Spider Tasks periodically; what you need then is a schedule.

In Crawlab, the concept of a Schedule is similar to crontab in Linux. It is a long-lived job that runs spider tasks periodically.

Tip

If you want a web crawler that automatically scrapes every day/week/month, you should set up a Schedule. Schedules are the go-to choice for automation, especially for incremental-scraping spiders.

Create a Schedule

  1. Navigate to the Schedules page
  2. Click the New Schedule button at the top left
  3. Enter the basic info, including Name, Cron Expression, and Spider
  4. Click Confirm

A newly created schedule is enabled by default and should trigger Tasks at the times described by its cron expression.

Tip

You can debug whether the schedule module works properly by setting the Cron Expression to * * * * * (run every minute) and checking whether a task is triggered at the start of each minute.

Enable/Disable

You can enable or disable a schedule to control whether it runs spider tasks, by clicking the toggle in the Enabled attribute on the Schedules page or the detail page.

Cron Expression

A Cron Expression is a simple standard format describing a period. It uses the same format as Linux crontab.

*    *    *   *    *  Command_to_execute
@@ -63,6 +63,6 @@
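A few common expressions for reference (illustrative):

0 0 * * *       run once a day at midnight
*/10 * * * *    run every 10 minutes
0 9 * * 1       run at 09:00 every Monday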
   
 
- + diff --git a/docs/.vuepress/dist/zh/guide/spider/file-editor.html b/docs/.vuepress/dist/zh/guide/spider/file-editor.html index a7abc9d..ed9f8b1 100644 --- a/docs/.vuepress/dist/zh/guide/spider/file-editor.html +++ b/docs/.vuepress/dist/zh/guide/spider/file-editor.html @@ -38,7 +38,7 @@ } - +

File Editor


File Editor

Crawlab allows users to edit files in the browser. This is very useful for editing files such as settings.py and items.py in spiders.

Open a File

  1. Navigate to the Files tab on the spider detail page.
    files-tab
  2. Double-click the file to edit.
    files-sidebar
  3. The file should open in the editor.
    file-editor

Edit a File

  1. Modify the file.

Save a File

  1. Press Ctrl + S or click the Save button in the nav bar to save the file.
    save-btn

Move a File

  1. Drag and drop the file into the target folder.

Rename a File

  1. Right-click the file and select Rename.
    rename

Duplicate a File

  1. Right-click the file and select Duplicate.
    duplicate

Delete a File

  1. Click Delete in the context menu.
    delete-file
- + diff --git a/docs/.vuepress/dist/zh/guide/spider/git.html b/docs/.vuepress/dist/zh/guide/spider/git.html index db89953..e684e0e 100644 --- a/docs/.vuepress/dist/zh/guide/spider/git.html +++ b/docs/.vuepress/dist/zh/guide/spider/git.html @@ -38,7 +38,7 @@ } - +

Git Integration


Git Integration

Crawlab allows users to set up Git repositories for spider version control. This is very useful for team collaboration.

Set up a Git Repository

  1. Navigate to the Git tab on the spider detail page.
  2. Enter the Git repository URL; the system detects whether it is HTTPS or SSH.
  3. Enter the username and password for the Git repository, or an SSH key.
  4. Click the Save button.

Switch Branches

  1. Click the Checkout button.
  2. Select the branch to check out.
  3. Click the Confirm button.

Pull Code

  1. Click the Pull button and confirm.
  2. The system pulls the code from the remote.

Auto-Pull

  1. Check the Auto Pull option.
  2. Select the pull interval.
  3. For spiders with Auto Pull enabled, the system pulls the code automatically at the configured interval.

Commit Changes

  1. Select the Changes tab.
  2. Check the changed files.
  3. Click the Commit button and confirm.
- + diff --git a/docs/.vuepress/dist/zh/guide/spider/index.html b/docs/.vuepress/dist/zh/guide/spider/index.html index c213e46..74bf374 100644 --- a/docs/.vuepress/dist/zh/guide/spider/index.html +++ b/docs/.vuepress/dist/zh/guide/spider/index.html @@ -38,7 +38,7 @@ } - +

Spiders


Spiders

In Crawlab, a spider is the basic unit of a web crawler program. You can think of it as a crawler software project composed of code files, such as a Scrapy project. Note that the word "project" here is different from Project, which is a separate base concept in Crawlab.

Note

The concept of Spider is very important in Crawlab, so we strongly recommend reading this section carefully.

Typical Workflow

Below is the typical workflow for operating spiders in Crawlab.

Create a Spider

  1. Navigate to the Spiders page and click the New Spider button at the top left
  2. Enter the Name, Execute Command, and other relevant info
  3. Click Confirm

The Execute Command is the base command used when the spider runs, e.g. scrapy crawl myspider; it is equivalent to the bash/shell command executed when running the spider.

Upload a Spider

There are several ways to upload spider files.

Upload a Directory

  1. Navigate to the spider detail page
  2. Click the Files tab
  3. Click the Upload Files button in the nav bar
  4. Select Directory
  5. Click Click to select a directory to upload
  6. Select the directory containing the spider files
  7. Click Confirm

Upload Files

  1. Navigate to the spider detail page
  2. Click the Files tab
  3. Click the Upload Files button in the nav bar
  4. Select Files
  5. Drag the spider files into the drop zone, or click the drop zone and select them
  6. Click Confirm

Upload Files (Drag and Drop)

  1. Navigate to the spider detail page
  2. Click the Files tab
  3. Drag spider files or directories into a folder in the left sidebar

Run a Spider

You can run a spider by following these steps:

  1. If you are on the spider detail page, click the play button labeled Run in the nav bar
  2. If you are on the Spiders page, click the play button labeled Run on the right
  3. Choose the appropriate run settings
  4. Click Confirm

Below is an explanation of the spider run settings.

  • Command: the base cmd/bash/shell command that will actually be run
  • Param: the parameters passed to the Command
  • Mode: the run mode, Random Node by default
  • Priority: the task priority, 5 by default

Entity Relationship

- + diff --git a/docs/.vuepress/dist/zh/guide/spider/integration.html b/docs/.vuepress/dist/zh/guide/spider/integration.html index bf6e7c3..11d9802 100644 --- a/docs/.vuepress/dist/zh/guide/spider/integration.html +++ b/docs/.vuepress/dist/zh/guide/spider/integration.html @@ -38,7 +38,7 @@ } - +

Data Integration


Data Integration

You can integrate your spider data through the Crawlab SDK. This allows you to view scraped results visually in Crawlab.

The Crawlab SDK supports integration with various crawler frameworks, including Scrapy, and various programming languages, including Python, Node.js, and Go.

NOTE

By default, the Crawlab SDK is pre-installed in Crawlab's base image. If you are not using the Crawlab Docker image, you can also install it manually.

Scrapy

  1. Make sure you have created the Scrapy spider in Crawlab.
  2. In settings.py, add crawlab.CrawlabPipeline to ITEM_PIPELINES.
    ITEM_PIPELINES = {
    @@ -64,6 +64,6 @@
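A minimal sketch of the full setting (the pipeline class name comes from this page; the priority value 888 is an assumption based on common examples):

ITEM_PIPELINES = {
    'crawlab.CrawlabPipeline': 888,
}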
       
     
- + diff --git a/docs/.vuepress/dist/zh/guide/spider/selenium.html b/docs/.vuepress/dist/zh/guide/spider/selenium.html new file mode 100644 index 0000000..d227b84 --- /dev/null +++ b/docs/.vuepress/dist/zh/guide/spider/selenium.html @@ -0,0 +1,99 @@ + + + + + + + + Selenium 爬虫集成 + + + + + + +

Selenium Spider Integration


Selenium Spider Integration

Introduction to Selenium

Selenium is a tool for web application testing, but it can also be used to write web crawlers. Unlike traditional HTTP request libraries (such as Requests), Selenium lets you simulate browser behavior, actually automating a browser to fetch data. This is very useful for dynamic web pages that require JavaScript rendering.

Integrating a Selenium Spider in Crawlab

Below, we introduce how to integrate a Selenium spider in Crawlab and display the scraped results in the Crawlab frontend. We will use 36Kr as the example.

Create a Spider

In the Crawlab spider list, create a spider named "36kr" with the execute command python main.py.

Edit the Spider File

Create and open main.py, and enter the following content.

from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from crawlab import save_item
+
+# create web driver with chrome
+chrome_options = Options()
+chrome_options.add_argument('--headless')
+chrome_options.add_argument('--no-sandbox')
+chrome_options.add_argument('--disable-dev-shm-usage')
+browser = webdriver.Chrome(options=chrome_options)
+
+# navigate to news list page
+browser.get('https://36kr.com/information/web_news/')
+
+# get article items
+items = browser.find_elements(by=By.CSS_SELECTOR, value='.information-flow-list > .information-flow-item')
+
+# iterate items
+for item in items:
+    # fields
+    el_title = item.find_element(by=By.CSS_SELECTOR, value='.article-item-title')
+    title = el_title.text
+    url = el_title.get_attribute('href')
+    topic = item.find_element(by=By.CSS_SELECTOR, value='.kr-flow-bar-motif > a').text
+    description = item.find_element(by=By.CSS_SELECTOR, value='.article-item-description').text
+    try:
+        pic_url = item.find_element(by=By.CSS_SELECTOR, value='.article-item-pic > img').get_attribute('src')
+    except Exception:
+        pic_url = None
+
+    # save to crawlab
+    save_item({
+        'title': title,
+        'url': url,
+        'topic': topic,
+        'description': description,
+        'pic_url': pic_url,
+    })
+
+# close the browser to release the headless Chrome process
+browser.quit()
+

Here we defined the Chrome browser's chrome_options, to which the following arguments must be added.

Note

This is important; otherwise the spider will not run properly in Crawlab!

chrome_options.add_argument('--headless')
+chrome_options.add_argument('--no-sandbox')
+chrome_options.add_argument('--disable-dev-shm-usage')
+

Finally, we use the save_item method from the Crawlab SDK to save the results scraped by the spider.

Run the Spider

Run the "36kr" spider, and you will get the scraped results.

+ + + diff --git a/docs/.vuepress/dist/zh/guide/task/index.html b/docs/.vuepress/dist/zh/guide/task/index.html index 3ee56d0..3fb75c3 100644 --- a/docs/.vuepress/dist/zh/guide/task/index.html +++ b/docs/.vuepress/dist/zh/guide/task/index.html @@ -38,7 +38,7 @@ } - +

Tasks


Tasks

A task is a process triggered by a Spider that scrapes website data, performs special operations, or provides other functionality. It is the basic unit of running spider processes.

In Crawlab, you can not only run tasks with one click, but also visually inspect task information such as statistics, real-time logs, and scraped data. You can also set a Priority to decide the execution order of tasks.

Run a Task

You can run a task from a spider, or follow the steps below.

  1. Navigate to the Tasks page
  2. Click the New Task button at the top left
  3. Select the Spider and other info
  4. Click Confirm

Rerun a Task

  1. Navigate to the Tasks page
  2. Click the Rerun button on the right

Monitor Tasks

Crawlab provides task monitoring, letting you closely observe scraped result data and scraping efficiency.

View Logs

You can view real-time logs in Crawlab.

  1. Navigate to the task detail page
  2. Click the Logs tab

View Data

You can view scraped data in real time.

  1. Navigate to the task detail page
  2. Click the Data tab

Cancel a Task

If a task is in Pending or Running status, you can cancel it by:

  1. Clicking Cancel on the right on the Tasks page, or
  2. Clicking the Cancel button in the nav bar on the task detail page
- + diff --git a/docs/.vuepress/dist/zh/guide/token/index.html b/docs/.vuepress/dist/zh/guide/token/index.html index d70549e..9978bf1 100644 --- a/docs/.vuepress/dist/zh/guide/token/index.html +++ b/docs/.vuepress/dist/zh/guide/token/index.html @@ -38,7 +38,7 @@ } - +


- + diff --git a/docs/.vuepress/dist/zh/guide/user/index.html b/docs/.vuepress/dist/zh/guide/user/index.html index ff04843..b14d34f 100644 --- a/docs/.vuepress/dist/zh/guide/user/index.html +++ b/docs/.vuepress/dist/zh/guide/user/index.html @@ -38,7 +38,7 @@ } - +

Users


Users

Users in Crawlab are equivalent to user accounts in most management systems. User accounts can be created, updated, and deleted.

Admin User

By default, an admin account is created when Crawlab is initialized. Its default username and password are below.

  • Username: admin
  • Password: admin

Create a User

  1. Navigate to the Users page and click the New User button at the top left
  2. Enter the Username, Password, and other relevant info
  3. Click Confirm

Change a User's Password

  1. Navigate to the user detail page
  2. Click the Change Password button
  3. Enter the new password and click Confirm
- + diff --git a/docs/.vuepress/dist/zh/index.html b/docs/.vuepress/dist/zh/index.html index 0615e8b..d6900a5 100644 --- a/docs/.vuepress/dist/zh/index.html +++ b/docs/.vuepress/dist/zh/index.html @@ -38,7 +38,7 @@ } - +

Hello

Documentation for distributed web crawler management platform

- + diff --git a/docs/.vuepress/dist/zh/migration/index.html b/docs/.vuepress/dist/zh/migration/index.html index 074fc8d..54344f5 100644 --- a/docs/.vuepress/dist/zh/migration/index.html +++ b/docs/.vuepress/dist/zh/migration/index.html @@ -38,7 +38,7 @@ } - +

Version Migration


Version Migration

Crawlab has gone through several major version updates since its release. Because the underlying design changed significantly, the versions are not fully compatible with each other. If you want to migrate from an old version to a new one, please follow the corresponding migration guide below.

v0.6

Crawlab v0.6 is a major release with substantial improvements in stability, scalability, and functionality. To upgrade, please refer to the v0.6 migration guide.

v0.5

Crawlab v0.5 is the first stable version. For details, please refer to the v0.5 documentation.

Feature Comparison Across Versions

Features compared between v0.6 and v0.5: Node Management, Spider Deployment, Spider Code Editing, Spider Statistics, Framework Integration, Data Storage Integration, Git Integration, Task Scheduling, Task Logs, Task Statistics, Schedules, User Management, Dependency Management, Notifications, API, CLI, SDK, Custom Plugins, and Configurable Spiders (❌ in v0.6 for now).
- + diff --git a/docs/.vuepress/dist/zh/migration/v0.6.html b/docs/.vuepress/dist/zh/migration/v0.6.html index ff8e2f1..b3e7ad8 100644 --- a/docs/.vuepress/dist/zh/migration/v0.6.html +++ b/docs/.vuepress/dist/zh/migration/v0.6.html @@ -38,7 +38,7 @@ } - +

Upgrade from Older Versions to v0.6.x


Upgrade from Older Versions to v0.6.x

From v0.5.x

If you have deployed Crawlab v0.5.x and are already running spiders in production, you can upgrade as follows.

  1. Install the latest crawlab-sdk
  2. Run the following command to migrate your spider data to v0.6.x
    crawlab migrate \
    @@ -59,6 +59,6 @@
       
     
- + diff --git a/docs/.vuepress/dist/zh/principle/architecture/index.html b/docs/.vuepress/dist/zh/principle/architecture/index.html index f19c87d..267b311 100644 --- a/docs/.vuepress/dist/zh/principle/architecture/index.html +++ b/docs/.vuepress/dist/zh/principle/architecture/index.html @@ -38,7 +38,7 @@ } - +

Architecture


Architecture

Below is the architecture diagram of Crawlab.

Architecture

As a distributed system, Crawlab consists of several modules, including the Master Node, Worker Nodes, Database, File System, and Frontend. Each module (currently except nodes) is scalable, so the whole distributed system can be flexibly scaled to meet high performance requirements.

For example, spider scraping tasks are assigned to a cluster of worker nodes, and the cluster can then adjust its number of nodes as resource demand changes.

- + diff --git a/docs/.vuepress/dist/zh/principle/core-modules/index.html b/docs/.vuepress/dist/zh/principle/core-modules/index.html index 7da74a7..3cd5ec5 100644 --- a/docs/.vuepress/dist/zh/principle/core-modules/index.html +++ b/docs/.vuepress/dist/zh/principle/core-modules/index.html @@ -38,7 +38,7 @@ } - +

Core Modules


Core Modules

🚧 Under construction...

- + diff --git a/docs/.vuepress/dist/zh/principle/core-modules/schedule/index.html b/docs/.vuepress/dist/zh/principle/core-modules/schedule/index.html index 3ff260e..72fea07 100644 --- a/docs/.vuepress/dist/zh/principle/core-modules/schedule/index.html +++ b/docs/.vuepress/dist/zh/principle/core-modules/schedule/index.html @@ -38,7 +38,7 @@ } - +

Schedule


Schedule

🚧 Under construction...

- + diff --git a/docs/.vuepress/dist/zh/principle/core-modules/spider/index.html b/docs/.vuepress/dist/zh/principle/core-modules/spider/index.html index bfd52d3..54518fe 100644 --- a/docs/.vuepress/dist/zh/principle/core-modules/spider/index.html +++ b/docs/.vuepress/dist/zh/principle/core-modules/spider/index.html @@ -38,7 +38,7 @@ } - +

Spider


Spider

🚧 Under construction...

- + diff --git a/docs/.vuepress/dist/zh/principle/core-modules/task/index.html b/docs/.vuepress/dist/zh/principle/core-modules/task/index.html index c722d60..9067a7c 100644 --- a/docs/.vuepress/dist/zh/principle/core-modules/task/index.html +++ b/docs/.vuepress/dist/zh/principle/core-modules/task/index.html @@ -38,7 +38,7 @@ } - +

Task


Task

🚧 Under construction...

- + diff --git a/docs/.vuepress/dist/zh/principle/database/index.html b/docs/.vuepress/dist/zh/principle/database/index.html index 1681b76..f76500b 100644 --- a/docs/.vuepress/dist/zh/principle/database/index.html +++ b/docs/.vuepress/dist/zh/principle/database/index.html @@ -38,7 +38,7 @@ } - +

Database


Database

Crawlab uses MongoDB, an open-source, high-performance NoSQL database, as its operational database.

- + diff --git a/docs/.vuepress/dist/zh/principle/database/mongodb.html b/docs/.vuepress/dist/zh/principle/database/mongodb.html index cd0c4d1..dd7b9f7 100644 --- a/docs/.vuepress/dist/zh/principle/database/mongodb.html +++ b/docs/.vuepress/dist/zh/principle/database/mongodb.html @@ -38,7 +38,7 @@ } - +

MongoDB


MongoDB

MongoDB is one of the most popular NoSQL databases. Its schemaless nature frees developers from database schema migrations caused by changes to data fields, enabling rapid application development.

The biggest reason Crawlab chose MongoDB is flexibility. Crawlab is under rapid development, and its data models and fields often need to change. MongoDB is therefore the best fit.
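As a small illustration of the schemaless benefit (a sketch using pymongo; the collection and fields are hypothetical, not Crawlab's actual schema):

from pymongo import MongoClient

col = MongoClient('mongodb://localhost:27017')['test']['spiders']

# documents with different fields can coexist in one collection,
# so adding a field requires no schema migration
col.insert_one({'name': 'spider1'})
col.insert_one({'name': 'spider2', 'cmd': 'python main.py'})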

- + diff --git a/docs/.vuepress/dist/zh/principle/filesystem/index.html b/docs/.vuepress/dist/zh/principle/filesystem/index.html index 72a170c..f841e60 100644 --- a/docs/.vuepress/dist/zh/principle/filesystem/index.html +++ b/docs/.vuepress/dist/zh/principle/filesystem/index.html @@ -38,7 +38,7 @@ } - +

File System


File System

Crawlab uses SeaweedFS as its distributed file system.

Below is how it synchronizes files between the master node and worker nodes.

- + diff --git a/docs/.vuepress/dist/zh/principle/filesystem/seaweedfs.html b/docs/.vuepress/dist/zh/principle/filesystem/seaweedfs.html index 6cae585..ca758cc 100644 --- a/docs/.vuepress/dist/zh/principle/filesystem/seaweedfs.html +++ b/docs/.vuepress/dist/zh/principle/filesystem/seaweedfs.html @@ -38,7 +38,7 @@ } - +

SeaweedFS


SeaweedFS

SeaweedFS is an open-source distributed file system well suited to storing small files. Crawlab is a distributed crawler management platform, and SeaweedFS serves as the medium for synchronizing script and program files between nodes.

For more information, please check the wiki on GitHub.

- + diff --git a/docs/.vuepress/dist/zh/principle/frontend/index.html b/docs/.vuepress/dist/zh/principle/frontend/index.html index 3e59eb2..43b336c 100644 --- a/docs/.vuepress/dist/zh/principle/frontend/index.html +++ b/docs/.vuepress/dist/zh/principle/frontend/index.html @@ -38,7 +38,7 @@ } - +

Frontend


Frontend

🚧 Under construction...

- + diff --git a/docs/.vuepress/dist/zh/principle/index.html b/docs/.vuepress/dist/zh/principle/index.html index 25cdcb6..b29adf2 100644 --- a/docs/.vuepress/dist/zh/principle/index.html +++ b/docs/.vuepress/dist/zh/principle/index.html @@ -38,7 +38,7 @@ } - +

Introduction


Introduction

As a developer, you may be interested in how Crawlab works. This chapter introduces the relevant fundamentals and the technology behind them.

This chapter covers the following topics:

- + diff --git a/docs/.vuepress/dist/zh/principle/node/index.html b/docs/.vuepress/dist/zh/principle/node/index.html index d0a10a4..0c5b1ed 100644 --- a/docs/.vuepress/dist/zh/principle/node/index.html +++ b/docs/.vuepress/dist/zh/principle/node/index.html @@ -38,7 +38,7 @@ } - +

Nodes


Nodes

A node is an entity that can manage and execute crawler programs; it is the basic unit of resources in the distributed system.

There are two node types:

If you want to learn how nodes communicate with each other, please refer to Node Communication.

- + diff --git a/docs/.vuepress/dist/zh/principle/node/master-node.html b/docs/.vuepress/dist/zh/principle/node/master-node.html index 45d49b1..c08a2af 100644 --- a/docs/.vuepress/dist/zh/principle/node/master-node.html +++ b/docs/.vuepress/dist/zh/principle/node/master-node.html @@ -38,7 +38,7 @@ } - +

Master Node


Master Node

The Master Node is the central part of the distributed system in Crawlab. It is mainly responsible for managing worker nodes, dispatching spider tasks, and sending and receiving messages from worker nodes.

The Master Node consists of many submodules and is more complex than worker nodes. You can refer to the diagram below for its submodules.

- + diff --git a/docs/.vuepress/dist/zh/principle/node/node-communication.html b/docs/.vuepress/dist/zh/principle/node/node-communication.html index 633f99d..de1eb60 100644 --- a/docs/.vuepress/dist/zh/principle/node/node-communication.html +++ b/docs/.vuepress/dist/zh/principle/node/node-communication.html @@ -38,7 +38,7 @@ } - +

Node Communication


Node Communication

The Master Node communicates with Worker Nodes through gRPC, a high-performance RPC framework.

The diagram below shows how node communication works in Crawlab.

- + diff --git a/docs/.vuepress/dist/zh/principle/node/worker-node.html b/docs/.vuepress/dist/zh/principle/node/worker-node.html index ba50910..47725d7 100644 --- a/docs/.vuepress/dist/zh/principle/node/worker-node.html +++ b/docs/.vuepress/dist/zh/principle/node/worker-node.html @@ -38,7 +38,7 @@ } - +

Worker Node


Worker Node

Worker Nodes are mainly responsible for task execution: they receive spider tasks from the Master Node and run the crawler programs that scrape target websites. You can therefore regard worker nodes as executors. They do not handle API requests, task dispatching, or other Master Node functionality.

You can see the worker node submodules in the diagram below.

- + diff --git a/docs/.vuepress/dist/zh/pro/index.html b/docs/.vuepress/dist/zh/pro/index.html index 51c07c2..fcb8d80 100644 --- a/docs/.vuepress/dist/zh/pro/index.html +++ b/docs/.vuepress/dist/zh/pro/index.html @@ -38,7 +38,7 @@ } - +

Introduction


Introduction

Crawlab Pro is the enterprise-grade edition of Crawlab. Built on Crawlab's core functionality, it adds more features, is suitable for production environments, and offers better scalability and stability.

Demo

You can visit the Demo to explore the features of Crawlab Pro.

Feature Comparison

Feature                        Pro  Community
Spider Management              ✅   ✅
Schedules                      ✅   ✅
Task Management                ✅   ✅
Data Source Integration        ✅   ✅
Notifications                  ✅   ❌
Performance Monitoring         ✅   ❌
Permission Management          ✅   ❌
Environment Variables          ✅   ❌
Canvas (Low-Code Development)  ✅   ❌

How to Install

Installing Crawlab Pro is similar to the Community edition and very simple.

Please refer to Install Crawlab Pro for more.

License

You need a license to use Crawlab Pro.

- + diff --git a/docs/.vuepress/dist/zh/pro/installation.html b/docs/.vuepress/dist/zh/pro/installation.html index 6d7d029..4c20b5e 100644 --- a/docs/.vuepress/dist/zh/pro/installation.html +++ b/docs/.vuepress/dist/zh/pro/installation.html @@ -5,7 +5,7 @@ - - +

Installation


Installation

Crawlab Pro is distributed as a Docker image and can be installed in any environment that supports Docker.

Prerequisites

Tip

If you don't know how to set up a Docker environment, please refer to Docker Installation.

  • Docker environment. Make sure you have installed Docker and Docker Compose.
  • License. You need a license to use Crawlab Pro. If you don't have one yet, please contact us to obtain it.

Main Process

Tip

If you don't know how to obtain a license, please refer to License.

The main installation process for Crawlab Pro is the same as for the Crawlab Community edition, except that a license must be provided during installation. You can refer to the Install Community Edition with Docker process.

  1. Install Docker and Docker-Compose
  2. Pull the Crawlab Docker image (and the MongoDB image if you don't have MongoDB)
  3. Create docker-compose.yml and configure it (including the License)
  4. Start the Docker containers

Note

In the following guide, we assume you have installed Docker and Docker-Compose and have pulled the corresponding Docker images.

Standalone-Node Deployment

Standalone-Node Deployment (SND) is similar to the configuration in Quick Start and is normally used for demos or managing a small number of crawlers. In SND, all Docker containers, including Crawlab and MongoDB, run on a single machine, i.e. the Master Node (see the diagram above).

Create docker-compose.yml and enter the following content.

version: '3.3'
@@ -164,6 +164,6 @@
   
 
- + diff --git a/docs/.vuepress/dist/zh/pro/license.html b/docs/.vuepress/dist/zh/pro/license.html index 7504782..72ee02e 100644 --- a/docs/.vuepress/dist/zh/pro/license.html +++ b/docs/.vuepress/dist/zh/pro/license.html @@ -38,7 +38,7 @@ } - +

License


License

Crawlab Pro is a commercial product. You need a valid license to use it.

How to Purchase

You can purchase a license in the following ways:

  1. Buy online: https://www.crawlab.cn/prices
  2. Contact us: WeChat support account tikazyq1
- +