-
Notifications
You must be signed in to change notification settings - Fork 1
/
Crawler.hs
104 lines (75 loc) · 3.5 KB
/
Crawler.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
{-# LANGUAGE OverloadedStrings #-}
module Crawler where
import Network.Wreq
import Control.Lens
import Data.Aeson.Lens (_String, key)
import Data.ByteString.Lazy.Char8 as BS
import Data.List
import GHC.Int as GI
import Data.List.Split
import Data.Encoding.GB18030
import Data.Encoding.UTF8
import Data.Encoding(encodeString,decodeString)
-- | Position of the first occurrence of a pattern inside a list.
--
-- Returns @Just i@ where @i@ is the zero-based offset of the first suffix of
-- the second argument that starts with the first argument, or @Nothing@ when
-- no suffix does. An empty pattern matches at offset 0.
findSubstring :: Eq a => [a] -> [a] -> Maybe Int
findSubstring needle haystack =
  case matchingOffsets of
    (firstHit : _) -> Just firstHit
    [] -> Nothing
  where
    -- Pair every suffix with its offset and keep the offsets whose suffix
    -- begins with the needle; laziness means only the first hit is computed.
    matchingOffsets =
      [ offset
      | (offset, suffix) <- Prelude.zip [0 ..] (Data.List.tails haystack)
      , Data.List.isPrefixOf needle suffix
      ]
{-
fetchLinks :: String -> (String,String) -> [String]
fetchLinks [] _ = []
fetchLinks (x:xs)
-}
{- Implemented with library functions (splitOn from Data.List.Split) -}
-- | Extract the pieces of text delimited by a start marker and an end marker.
--
-- The input is split on the start marker @parstr1@; from every resulting
-- segment the text up to the first occurrence of the end marker @parstr2@ is
-- kept, and empty results are dropped.
--
-- Note that the segment preceding the first start marker is processed like
-- any other, so (when non-empty) the text before the first @parstr1@, cut at
-- the first @parstr2@, is the first element of the result.
fetchTotal1 :: String -> String -> String -> [String]
fetchTotal1 str parstr1 parstr2 =
  [ piece
  | segment <- splitOn parstr1 str
    -- Pattern match instead of 'head': total even for an empty split result.
  , piece : _ <- [splitOn parstr2 segment]
    -- 'null' is O(1), unlike comparing 'length' against zero.
  , not (Prelude.null piece)
  ]
-- | Absolute URL of a page under @www.kanshu.com@; presumably the book's
-- chapter index (the commented-out snippet at the end of this file fetches
-- the same URL). No other definition in this file references it.
bookUrl :: String
bookUrl = "http://www.kanshu.com/files/article/html/111220/"
-- | Download one chapter page, print its text to stdout and return it.
--
-- @url@ is a site-relative path; it is appended to @http://www.kanshu.com@.
-- The response body is unpacked to a 'String' and the chapter body is taken
-- as the second piece returned by 'fetchTotal1' for the
-- @<div class="yd_text2">@ / @<span id="avg_link">@ markers (the first piece
-- is the text preceding the first start marker). That piece is then decoded
-- as GB18030.
--
-- Fails with an 'IOError' naming the URL when fewer than two pieces are
-- found, rather than with an anonymous empty-list error.
getChapter :: String -> IO String
getChapter url = do
  response <- get ("http://www.kanshu.com" ++ url)
  let page = BS.unpack (response ^. responseBody)
      sections = fetchTotal1 page "<div class=\"yd_text2\">" "<span id=\"avg_link\">"
  case sections of
    (_ : rawBody : _) -> do
      let chapter = decodeString GB18030 rawBody
      Prelude.putStrLn chapter
      -- Return the decoded text instead of a fixed placeholder value.
      return chapter
    _ -> ioError (userError ("getChapter: chapter body not found in " ++ url))
-- | Path of the output file that 'writeChapter' appends to.
writeFilePath :: FilePath
writeFilePath = "chapter.txt"
-- | Append the given string, followed by a newline, to the file named by
-- 'writeFilePath'.
writeChapter :: String -> IO ()
writeChapter = Prelude.appendFile writeFilePath . (++ "\n")
-- | Extract link targets from the local file @title.html@, print each one to
-- stdout and then append each one as a line to the output file via
-- 'writeChapter'.
--
-- Every piece found between @<li>@ and @</li>@ is scanned for its first
-- @<a href="…"@ target. Pieces for which nothing can be extracted are
-- skipped instead of aborting the run with an empty-list 'head' error.
--
-- NOTE(review): the values written to the file are the extracted URLs
-- themselves; 'getChapter' is never called here — confirm this is intended.
tmain :: IO ()
tmain = do
  page <- Prelude.readFile "title.html"
  let items = fetchTotal1 page "<li>" "</li>"
      -- The pattern match keeps only the first result per item and drops
      -- items for which fetchTotal1 returns nothing.
      urls = [url | item <- items, url : _ <- [fetchTotal1 item "<a href=\"" "\""]]
  -- mapM_ rather than mapM: the per-element results are not needed.
  mapM_ Prelude.putStrLn urls
  mapM_ writeChapter urls
{- Hand-written implementation (does not use Data.List.Split)
fetchByHead :: String -> String -> Int -> [Maybe Int]
fetchByHead [] _ _ = []
fetchByHead str parstr startIndex = case findSubstring parstr str of
Just strIndex -> [Just (strIndex+startIndex)] ++ fetchByHead (Prelude.drop (strIndex + lengthStrPar) str) parstr (strIndex+startIndex)
Nothing -> []
where lengthStrPar = Data.List.length parstr
fetchTotal :: String -> String -> String -> [String]
fetchTotal [] _ _ = []
fetchTotal str headStr tailStr = case fetchHead str headStr of
[] -> []
strA -> case fetchTail strA tailStr of
(h,t) -> h : fetchTotal t headStr tailStr
fetchHead ::String -> String -> String
fetchHead [] _ = []
fetchHead str parstr = case findSubstring parstr str of
Nothing -> []
Just strIndex -> Prelude.drop (strIndex + (Data.List.length parstr)) str
fetchTail :: String -> String->(String,String)
fetchTail [] _ = ([],[])
fetchTail str parstr = case findSubstring parstr str of
Nothing -> ([],[])
Just strIndex -> (Prelude.take strIndex str,Prelude.drop (strIndex + (Data.List.length parstr)) str)
-}
{-
result <- get "http://www.kanshu.com/files/article/html/111220/"
result^.responseStatus
result^.responseStatus.statusCode
BS.putStrLn $ result^.responseBody
-}