from lxml import etree
import re
# Parse the saved page and collect the text nodes directly under each
# <div class="question">.
# NOTE(review): etree.parse() uses lxml's XML parser by default; if the saved
# page is not well-formed XML, pass etree.HTMLParser() as the parser -- confirm.
tree = etree.parse("./爬虫.xpath示例.html")
questions = tree.xpath('//div[@class="question"]/text()')
# Clean each text node: remove all whitespace first, then remove every
# "<digits>." run (the question numbering). Raw strings are required so that
# \s, \d and \. reach the regex engine instead of being parsed as (invalid)
# Python string escapes.
questions = [
    re.sub(r'\d+\.', '', re.sub(r'\s', '', text))
    for text in questions
]
print(questions)
# Build one string per <div class="answer"> holding all of its option labels.
selects = tree.xpath('//div[@class="answer"]')
# Option text sits in a nested <span> under either an el-radio__label or an
# el-checkbox__label span; the XPath union picks up both kinds in one query.
label_xpath = (
    './/span[@class="el-radio__label"]/span/text()'
    ' | .//span[@class="el-checkbox__label"]/span/text()'
)
# Labels of one answer block are joined with ";\n" so each question's options
# occupy a single entry (one spreadsheet cell later on).
options = [
    ';\n'.join(answer_div.xpath(label_xpath))
    for answer_div in selects
]
print(options)
# Collect the text nodes directly under each <div class="rightAnswer"> and
# remove all whitespace from them. The pattern is a raw string so that \s is
# handed to the regex engine rather than parsed as an invalid string escape.
answers = tree.xpath('//div[@class="rightAnswer"]/text()')
answers = [re.sub(r'\s', '', text) for text in answers]
print(answers)
import pandas as pd
# Assemble the three parallel lists into a table, one row per question.
# pandas requires all columns to have the same length and raises ValueError
# otherwise.
columns = {
    "题目": questions,
    "选项": options,
    "答案": answers,
}
data = pd.DataFrame(columns)
print(data)
# Write the table to a spreadsheet without the row-index column.
# NOTE(review): the engine is chosen from the ".xls" extension; recent pandas
# releases (2.0+) may no longer provide a writer for it -- confirm against the
# installed pandas version before relying on this output.
data.to_excel('./爬虫.xpath结果.xls', index=False)