[Python]爬取小说
最近学了学爬虫,初窥门径
爬取网上某网站小说
代码如下
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/9/17 08:12
# @Author : Aen
# @File : 10.py
# @Software: PyCharm
# @function: scrape a web novel, chapter by chapter, into a local text file

import requests
import fake_useragent
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Example catalogue page: "https://www.xsbiquge.com/20_20331/"
    url = input('address:')
    filename = input('filename:')
    headers = {
        "User-Agent": fake_useragent.UserAgent().chrome
    }

    # One Session reuses the TCP connection across all chapter requests
    # instead of opening a fresh connection per requests.get().
    session = requests.Session()
    session.headers.update(headers)

    # The site serves UTF-8 but without a charset header, so requests
    # guesses ISO-8859-1. Setting response.encoding decodes correctly and,
    # unlike .text.encode('ISO-8859-1').decode('utf-8'), cannot raise
    # UnicodeDecodeError on stray bytes.
    index_resp = session.get(url)
    index_resp.encoding = 'utf-8'
    index_soup = BeautifulSoup(index_resp.text, 'lxml')
    # Each <dd> in the catalogue holds one chapter link (<a href=...>).
    chapter_items = index_soup.select('.box_con > div > dl > dd')

    # Context manager guarantees the file is closed even if a request
    # or parse fails mid-loop (the original bare open() leaked on error).
    with open('./' + filename + '.txt', 'w', encoding='utf-8') as fp:
        for item in chapter_items:
            title = item.a.string
            address = 'https://www.xsbiquge.com' + item.a['href']
            chapter_resp = session.get(address)
            chapter_resp.encoding = 'utf-8'
            chapter_soup = BeautifulSoup(chapter_resp.text, 'lxml')
            content_div = chapter_soup.find('div', id='content')
            if content_div is None:
                # Skip pages missing the expected content div rather than
                # crashing the whole run with AttributeError.
                print(title, '抓取失败')
                continue
            fp.write(title + ':\n' + content_div.text + '\n')
            print(title, '抓取成功')
评论
0 评论