From b51e710e89d3409436002e0d69c83d1c34f4145a Mon Sep 17 00:00:00 2001 From: jiangwei Date: Thu, 1 Aug 2019 22:36:43 +0800 Subject: [PATCH] better --- Query.js | 32 ++++++++++++++++++++++++++++---- README.md | 31 +++++++++++++++++++++++++++++-- config-all.yaml | 4 ---- config.yaml | 4 ---- entrance/info.go | 10 +++++----- entrance/lianjia.go | 19 +++++++++++++------ entrance/lianjia_zufang.go | 34 ++++++++++++++++++++++++++-------- entrance/zhilian.go | 3 +++ main.go | 9 +++++++-- 9 files changed, 111 insertions(+), 35 deletions(-) diff --git a/Query.js b/Query.js index 48c0e5b..15a4858 100644 --- a/Query.js +++ b/Query.js @@ -1,8 +1,7 @@ -// 这里是一些常用的查询语句 +// 这里是分析数据时用的的查询语句 // 房价均价查询语句 - db.lianjia.aggregate([ {'$match': {"address.0": {$exists: true}}}, { @@ -29,7 +28,6 @@ db.lianjia.aggregate([ // 平均薪资查询语句 - db.zhilian.aggregate([ {'$match': {"workingExp.name": "1-3年"}}, { @@ -60,4 +58,30 @@ db.lianjia.aggregate([ { '$sort': {detailCrawlTime: -1} } -], {allowDiskUse: true}); \ No newline at end of file +], {allowDiskUse: true}); + + +// 租房数据 +db.lianjia_zufang.aggregate([ + { + $group: { + _id: "$city", + count: {$sum: 1}, + avg: {$avg: "$price"}, + std: {$stdDevPop: "$price"}, + unitPrice: {$avg: {$divide: ["$price", "$mianji"]}} + } + }, + { + $project: { + unitPrice: 1, // 单位价格 + count: 1, //总数 + avg: 1, //每平米均价 + std: 1, //标准差 + ratio: {$divide: ["$std", "$avg"]} //标准差与均价的比值 + } + }, + { + '$sort': {count: -1} + } +]); \ No newline at end of file diff --git a/README.md b/README.md index 78b0d3f..47bce23 100644 --- a/README.md +++ b/README.md @@ -65,13 +65,40 @@ option支持:lianjia_ershou、zhilian、lianjia_zufang 方便定时脚本记录抓取情况,使用info命令可以输出当前抓取数据量到文件 ``` -getAwayBSG -info +getAwayBSG -info -info_save_to=./numLog.txt ``` +使用-info_save_to参数指定文件保存位置,默认为当前目录的numLog.txt文件中 + 3.help 输出支持的全部命令列表 ``` getAwayBSG -help -``` \ No newline at end of file +``` + + +## 数据分析 + +分析用的MongoDB语句在[Query.js](./Query.js)文件中,使用MongoDB执行即可 + +## 编译 + +编译使用xgo,需要先安装docker + +``` +git clone https://github.com/jiangwei1995910/getAwayBSG + +docker pull karalabe/xgo-latest + +go get github.com/karalabe/xgo + +cd getAwayBSG + +sh ./build.sh +``` + +## 部署 + +如果需要分布式或者多进程抓取,在不同机器或者多个进程中指定相同的MongoDB源即可,程序已经支持分布式多进程抓取了。已抓取的链接和状态会通过MongoDB共享 \ No newline at end of file diff --git a/config-all.yaml b/config-all.yaml index 3ad92f4..2483934 100644 --- a/config-all.yaml +++ b/config-all.yaml @@ -1515,10 +1515,6 @@ zlCityList: url: https://www.zhaopin.com/maoming/ code: 771 pinyin: maoming - - name: 蒙自市 - url: https://www.zhaopin.com/mengzishi/ - code: - pinyin: mengzishi - name: 满洲里 url: https://www.zhaopin.com/manzhouli/ code: 10157 diff --git a/config.yaml b/config.yaml index ff69ce1..e097242 100644 --- a/config.yaml +++ b/config.yaml @@ -1515,10 +1515,6 @@ zlCityList: # url: https://www.zhaopin.com/maoming/ # code: 771 # pinyin: maoming -# - name: 蒙自市 -# url: https://www.zhaopin.com/mengzishi/ -# code: -# pinyin: mengzishi # - name: 满洲里 # url: https://www.zhaopin.com/manzhouli/ # code: 10157 diff --git a/entrance/info.go b/entrance/info.go index dbf2d7b..c237aa3 100644 --- a/entrance/info.go +++ b/entrance/info.go @@ -10,14 +10,14 @@ import ( "time" ) -func Start_info() { +func Start_info(path string) { - fd, _ := os.OpenFile("./numLog.txt", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644) + fd, _ := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644) fd_time := time.Now().Format("2006-01-02 15:04:05") fd_content := strings.Join([]string{ - fd_time, ":\n", - getLianjiaErShouFangStatus(), "\n", - getLianJiaZuFangStatus(), "\n", + fd_time, ":", + getLianjiaErShouFangStatus(), " ", + getLianJiaZuFangStatus(), " ", getZhiLianStatus(), "\n", }, "") buf := []byte(fd_content) diff --git a/entrance/lianjia.go b/entrance/lianjia.go index 4916ab5..4ae42da 100644 --- a/entrance/lianjia.go +++ b/entrance/lianjia.go @@ -156,6 +156,7 @@ func crawlDetail() (sucnum int) { c := colly.NewCollector() configInfo := configs.Config() + //设置延时 if configInfo["crawlDelay"] != nil { delay, _ := configInfo["crawlDelay"].(json.Number).Int64() if delay > 0 { @@ -166,6 +167,7 @@ func crawlDetail() (sucnum int) { } } + //设置代理 if configInfo["proxyList"] != nil && len(configInfo["proxyList"].([]interface{})) > 0 { var proxyList []string for _, v := range configInfo["proxyList"].([]interface{}) { @@ -181,8 +183,11 @@ func crawlDetail() (sucnum int) { } } + //随机UA extensions.RandomUserAgent(c) + //自动referer extensions.Referer(c) + //设置MongoDB存储状态信息 storage := &cachemongo.Storage{ Database: "colly", URI: configInfo["dburl"].(string) + "/colly", @@ -240,6 +245,7 @@ func crawlDetail() (sucnum int) { odb := client.Database(configInfo["dbDatabase"].(string)) lianjia := odb.Collection(configInfo["dbCollection"].(string)) + //读取出全部需要抓取详情的数据 cur, err := lianjia.Find(ctx, bson.M{"detailCrawlTime": bson.M{"$exists": false}}) if err != nil { @@ -263,12 +269,12 @@ func crawlDetail() (sucnum int) { } func Start_lianjia_ershou() { - listFlag := make(chan int) - detailFlag := make(chan int) + listFlag := make(chan int) //记录列表抓取是否完成 + detailFlag := make(chan int) //记录详情是否抓取完成 go func() { listCrawler() - listFlag <- 1 + listFlag <- 1 //列表抓取完成 }() go func() { @@ -276,15 +282,16 @@ func Start_lianjia_ershou() { for i := 0; i < 1; i = 0 { if crawlDetail() == 0 { zeroNum++ - if zeroNum > 3 { + if zeroNum > 3 { //尝试3次都没有详情需要抓取,结束详情抓取 break } - time.Sleep(300 * time.Second) + time.Sleep(300 * time.Second) //没有详情需要抓取了,等待5分钟再尝试 } } - detailFlag <- 1 + detailFlag <- 1 //详情抓取完成 }() + //详情抓取与列表抓取都完成了,结束主线程 <-listFlag <-detailFlag } diff --git a/entrance/lianjia_zufang.go b/entrance/lianjia_zufang.go index 1a74321..1265adb 100644 --- a/entrance/lianjia_zufang.go +++ b/entrance/lianjia_zufang.go @@ -63,12 +63,13 @@ func TcrawlerOneCityZuFang(cityUrl string, cityname string) { }) c.OnHTML(".content__list--item", func(element *colly.HTMLElement) { - + var err error var link string var title string var address string var area string var price int + var mianji int element.ForEach(".twoline a", func(i int, element *colly.HTMLElement) { link = "https://" + element.Request.URL.Host + element.Attr("href") title = strings.TrimSpace(element.Text) @@ -82,6 +83,22 @@ func TcrawlerOneCityZuFang(cityUrl string, cityname string) { } }) + desc := element.ChildText(".content__list--item--des") + desc = strings.ReplaceAll(desc, " ", "") + desc = strings.ReplaceAll(desc, "\n", "") + fmt.Println(desc) + re, _ := regexp.Compile("(\\d+)㎡/") + indexs := re.FindStringIndex(desc) + if len(indexs) == 2 { + + mianji, err = strconv.Atoi(desc[indexs[0] : indexs[1]-4]) + if err != nil { + mianji = 0 + } + } else { + mianji = 0 + } + element.ForEach(".content__list--item-price em", func(i int, element *colly.HTMLElement) { var err error price, err = strconv.Atoi(element.Text) @@ -90,12 +107,12 @@ func TcrawlerOneCityZuFang(cityUrl string, cityname string) { } }) - fmt.Println(price) - fmt.Println(link) - fmt.Println(title) - fmt.Println(address) - fmt.Println(area) - fmt.Println(cityname) + //fmt.Println(price) + //fmt.Println(link) + //fmt.Println(title) + //fmt.Println(address) + //fmt.Println(area) + //fmt.Println(cityname) fmt.Println("--------------------") client := db.GetClient() @@ -103,13 +120,14 @@ func TcrawlerOneCityZuFang(cityUrl string, cityname string) { db := client.Database(configInfo["dbDatabase"].(string)) lianjia := db.Collection(configInfo["zufangCollection"].(string)) - _, err := lianjia.InsertOne(ctx, bson.M{ + _, err = lianjia.InsertOne(ctx, bson.M{ "Link": link, "title": title, "address": address, "area": area, "price": price, "city": cityname, + "mianji": mianji, "crawl_time": time.Now(), }) if err != nil { diff --git a/entrance/zhilian.go b/entrance/zhilian.go index a232aa6..00ff3a0 100644 --- a/entrance/zhilian.go +++ b/entrance/zhilian.go @@ -26,6 +26,9 @@ func Start_zhilian() { var total int = 1000 for start := 0; start < total; start += 50 { cityid := cityList[j].(map[string]interface{})["code"] + if cityid == nil { + fmt.Println(cityList[j]) + } icityid, err := cityid.(json.Number).Int64() if err != nil { icityid = 530 diff --git a/main.go b/main.go index 2a5d840..21aeb81 100644 --- a/main.go +++ b/main.go @@ -7,7 +7,7 @@ import ( "getAwayBSG/entrance" ) -// 实际中应该用更好的变量名 +// 申明配置变量 var ( help bool config string @@ -16,6 +16,7 @@ var ( zhilian bool clean bool info bool + infoSaveTo string ) func init() { @@ -26,15 +27,19 @@ func init() { flag.BoolVar(&zhilian, "zhilian", false, "抓取智联招聘数据") flag.BoolVar(&clean, "clean", false, "清理缓存") flag.BoolVar(&info, "info", false, "保存抓取状态") + flag.StringVar(&infoSaveTo, "info_save_to", "./numlog.txt", "输入状态文件保存位置") } func main() { flag.Parse() + //初始化配置信息,同时输出配置信息 if config != "" { configs.SetConfig(config) } fmt.Println(configs.Config()) + + //进入不同入口 if help { flag.Usage() } else if lianjia_ershou { @@ -46,7 +51,7 @@ func main() { } else if clean { entrance.Start_clean() } else if info { - entrance.Start_info() + entrance.Start_info(infoSaveTo) } else { flag.Usage() }