Skip to content

Latest commit

 

History

History
43 lines (34 loc) · 979 Bytes

爬虫.xpath学习与实践.md

File metadata and controls

43 lines (34 loc) · 979 Bytes

XPath 学习

from lxml import etree
import re

# Parse the sample page and collect the direct text nodes of every
# <div class="question"> element.
# NOTE(review): etree.parse() is called without an explicit parser; if the
# saved page is not well-formed XML this may raise -- confirm against the
# sample file (lxml also offers etree.HTMLParser() for real-world HTML).
tree = etree.parse("./爬虫.xpath示例.html")
questions = tree.xpath('//div[@class="question"]/text()')
# Clean each text node in two ordered passes: first drop all whitespace, then
# drop every "<digits>." run (the question numbering such as "1.").
# Raw strings keep "\s" / "\d" from being treated as (invalid) string escapes.
# NOTE(review): the numbering pattern is not anchored, so a "<digits>." run
# inside the question text itself is removed as well -- confirm this is intended.
questions = [
    re.sub(r'\d+\.', '', re.sub(r'\s', '', text))
    for text in questions
]
print(questions)
# Every <div class="answer"> holds the choices for one question. For each such
# block, pull the label texts of its radio and checkbox options and join them
# into a single string separated by ";" + newline.
selects = tree.xpath('//div[@class="answer"]')
options = [
    ';\n'.join(
        answer_block.xpath('.//span[@class="el-radio__label"]/span/text() | .//span[@class="el-checkbox__label"]/span/text()')
    )
    for answer_block in selects
]
print(options)
# Collect the direct text nodes of every <div class="rightAnswer"> element and
# strip all whitespace from each one.
answers = tree.xpath('//div[@class="rightAnswer"]/text()')
# Raw string so "\s" reaches the regex engine instead of being parsed as an
# (invalid) string escape sequence.
answers = [re.sub(r'\s', '', text) for text in answers]
print(answers)
import pandas as pd

# Assemble the three extracted lists into a table, one column per list.
# NOTE(review): this assumes questions, options and answers line up
# one-to-one (same length, same order) -- verify against the sample page.
columns = {
    "题目": questions,
    "选项": options,
    "答案": answers,
}
data = pd.DataFrame(columns)

print(data)

# Export the table to a spreadsheet without the index column.
# NOTE(review): the target is a legacy ".xls" file; whether the installed
# pandas still has a writer engine for that format should be confirmed.
data.to_excel('./爬虫.xpath结果.xls', index=False)