Crawler Code

#### urllib

from urllib import request
# **************************************** Basics ********************************************
# Decide on the url
base_url = 'http://www.baidu.com/'
# Send the request
response = request.urlopen(base_url)
# Read the response body
html = response.read()
print(html.decode('utf-8'))
# Save to disk
with open('2.html', 'w', encoding='utf-8') as k:
    k.write(html.decode('utf-8'))
# *************************************** Adding headers ********************************************
base_url = 'http://www.baidu.com/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
req = request.Request(url=base_url, headers=headers)
response = request.urlopen(req)
html = response.read()
print(html.decode('utf-8'))
# **************************************** Logging in to Renren via POST and a cookie jar (skipping the login page) ********************************
from urllib import request, parse
from http import cookiejar
# Instantiate a cookie manager
cookie = cookiejar.CookieJar()
cook_handle = request.HTTPCookieProcessor(cookie)
# Build an opener that keeps the session cookies
opener = request.build_opener(cook_handle)
base_url = 'http://www.renren.com/PLogin.do'
data = {
    'email': '17333119189',
    'password': '19960102kuai'
}
res = parse.urlencode(data)
req = request.Request(base_url, data=res.encode('utf-8'))
response = opener.open(req)
# ****************************************** Adding a proxy ****************************************
from urllib import request
base_url = 'http://www.66ip.cn/'
proxy = {
    'http': 'alice:123456@120.78.166.84:6666',
    'https': 'alice:123456@120.78.166.84:6666'
}
# Build the proxy handler
proxy_handler = request.ProxyHandler(proxy)
# Build the opener
opener = request.build_opener(proxy_handler)
headers = {
    'Cookie': '__jsluid=27be3fa08d7aa7c457f79068cd77bc79; Hm_lvt_1761fabf3c988e7f04bec51acd4073f4=1555141241,1555144721,1555152744,1555155158; Hm_lpvt_1761fabf3c988e7f04bec51acd4073f4=1555155531',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
response = opener.open(base_url)
print(response.read().decode('gb2312'))
# **************************************** requests basics ****************************************
import requests
base_url = 'http://www.baidu.com'
# Send the request
response = requests.get(base_url)
# The body can be read two ways: 1. as a string (text)  2. as raw bytes (content)
# Specify the encoding before reading .text
print(response)
response.encoding = 'utf-8'
print(response.text)
# content returns the raw bytes
print(response.content.decode('utf-8'))
# **************************************** Unauthenticated proxy ****************************************
# Crawl a page through a proxy ip
from urllib import request
base_url = 'http://www.baidu.com'
proxy = {
    'http': 'http://124.205.143.213:41372',
    'https': 'http://124.205.143.213:41372'
}
# Build the proxy handler
proxy_handler = request.ProxyHandler(proxy)
# Build the opener
opener = request.build_opener(proxy_handler)
# Send the request
response = opener.open(base_url)
print(response.read().decode('utf-8'))
# ***************************************** Authenticated proxy *****************************************
from urllib import request
base_url = 'http://www.baidu.com'
proxy = {
    'http': 'http://alice:123456@120.78.166.84:6666',
    'https': 'https://alice:123456@120.78.166.84:6666'
}
# Build the proxy handler
proxy_handler = request.ProxyHandler(proxy)
# Build the opener
opener = request.build_opener(proxy_handler)
# Send the request
response = opener.open(base_url)
print(response.read().decode('utf-8'))

#### requests

# ********************************************* get *****************************************
import requests
base_url = 'https://www.xicidaili.com/'
# Define the request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
res = requests.get(base_url, headers=headers)
print(res.text)
# *********************************************** post ***************************************
import requests, json
base_url = 'https://fanyi.baidu.com/sug'
data = {
    'kw': 'hello'
}
response = requests.post(base_url, data=data)
response.encoding = 'UTF-8'
res = json.loads(response.text)
# with open('2.json', 'w', encoding='utf-8') as f:
#     f.write(json.dumps(res, ensure_ascii=False))
print(res)
# ****************************************** Adding request headers **************************************
import requests
base_url = 'http://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}
response = requests.get(base_url, headers=headers)
print(response.text)

# ******************************************** XPath matching *************************************
from lxml import etree
import requests
base_url = 'https://www.mzitu.com/'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    'referer': 'https://www.mzitu.com/xinggan/',
    'cookie': 'Hm_lvt_dbc355aef238b6c32b43eacbbf161c3c=1555144582,1555144596,1555144615,1555145046; Hm_lpvt_dbc355aef238b6c32b43eacbbf161c3c=1555145072'
}
respond = requests.get(base_url, headers=headers)
# print(respond.text)
html = etree.HTML(respond.text)
# Pull the lazy-loaded image urls
html_xpath = html.xpath('//a/img[@class="lazy"]/@data-original')
# print(html_xpath)
for i in html_xpath:
    respond = requests.get(i, headers=headers)
    j = i[-14:-4]   # slice a file name out of the image url
    with open('./pic/{}.jpg'.format(j), 'wb') as f:
        f.write(respond.content)
# ******************************************* Adding a proxy ***************************************
import requests
base_url = 'http://www.baidu.com'
proxy = {
    'http': 'http://alice:123456@120.78.166.84:6666',
    'https': 'https://alice:123456@120.78.166.84:6666'
}
response = requests.get(base_url, proxies=proxy)
response.encoding = 'utf-8'
print(response.text)
# ******************************************** Combined example *******************************************
import requests, re
from lxml import etree
from urllib import parse
base_url = 'http://www.ccdi.gov.cn/'
proxy = {
    'http': 'http://alice:123456@120.78.166.84:6666',
    'https': 'https://alice:123456@120.78.166.84:6666'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
response = requests.get(base_url, headers=headers, proxies=proxy)
# with open('3.html', 'wb') as f:
#     f.write(response.content)
html_ele = etree.HTML(response.text)
url_list = html_ele.xpath('//nav/span[5]/a/@href')
# print(url_list)
response_jdbgpage = requests.get(url_list[0], headers=headers, proxies=proxy)
# with open('3.html', 'wb') as f:
#     f.write(response_jdbgpage.content)
html = etree.HTML(response_jdbgpage.text)
url2_list = html.xpath('//ul[@class="menu_list"]/li[position()>1]/a/@href')[0:3]
# print(url2_list)
for url3 in url2_list:
    new_url = parse.urljoin(url_list[0], url3)
    print(new_url)
    resp = requests.get(new_url, headers=headers, proxies=proxy)
    # with open('3.html', 'wb') as f:
    #     f.write(resp.content)
    pat = re.compile('<li class="on"><a href="../../(.*?)/">')
    res = pat.findall(resp.text)[2:-1]
    print(res)
    for i in res:
        base_url2 = 'http://www.ccdi.gov.cn/special/jdbg3/{}/'
        # print(base_url2.format(i))
        res2 = requests.get(base_url2.format(i), headers=headers)
        # with open('4.html', 'wb') as f:
        #     f.write(res2.content)
        pat = re.compile('<a href="(.*?)"')
        print(pat.findall(res2.text))
# ****************************************** Sharing data across processes ********************************
# Example: crawling disciplinary-violation notices from the CCDI site
import requests, re
from lxml import etree
from day14.day14_2 import Db           # local helper module, see the Db class below
from multiprocessing import Pool, Manager

# Crawl the index page
def First_webpage(url, queue):
    print('function 1', url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    first_webpage_response = requests.get(url, headers=headers, proxies=proxy())
    # Match every province
    pat = re.compile('<a href="./(.*?)/">')
    province_list = pat.findall(first_webpage_response.text)
    print(province_list)
    for province in province_list[2:-1]:
        type_list = ['fjbxgdwt_jdbg3', 'sffbwt_jdbg3', 'sfjds_jdbg3']
        for type1 in type_list:
            second_webpage_url = 'http://www.ccdi.gov.cn/special/jdbg3/{}/{}'.format(province, type1)
            print(second_webpage_url)
            queue.put((divoce_page_url, second_webpage_url))

# Function 2: walk the paginated list, e.g. http://www.ccdi.gov.cn/special/jdbg3/bt_bgt/sfjds_jdbg3
def divoce_page_url(url, queue):
    print('function 2', url)
    response = requests.get(url, proxies=proxy())
    pat = re.compile(r'createPageHTML\((.*?), 0, "index", "html"\)')
    page_count = pat.findall(response.text)
    for page in range(int(page_count[0])):
        if page == 0:
            page_url = url + '/index.html'
        else:
            page_url = url + '/index_{}.html'.format(page)
        response = requests.get(page_url, proxies=proxy())
        pat = re.compile(r'<li class="fixed">\s+<dl>\s+<dt>\s+<a href="./(.*?)" target="_blank"')
        url_list = pat.findall(response.text)
        for url_content in url_list:
            new_url = url + '/' + url_content
            queue.put((get_content_url, new_url))

# Function 3: fetch and print the article body (queue is unused but kept so the
# dispatcher below can call every task with the same signature)
def get_content_url(url, queue):
    print('function 3:', url)
    response = requests.get(url, proxies=proxy())
    response.encoding = 'utf-8'
    html = etree.HTML(response.text)
    content_list = html.xpath('//div[@class="content"]/div[1]/div[@class="TRS_Editor"]//text()')
    content = '\n'.join([item.strip() for item in content_list if item.strip() != ''])
    print(content)

def proxy():
    proxy = {
        'http': 'http://alice:123456@120.78.166.84:6666',
        'https': 'https://alice:123456@120.78.166.84:6666'
    }
    return proxy

if __name__ == '__main__':
    pool = Pool(5)
    url = 'http://www.ccdi.gov.cn/special/jdbg3/index.html'
    queue = Manager().Queue()
    queue.put((First_webpage, url))
    # Simple dispatcher: every queue item is a (function, url) pair
    while True:
        func, url = queue.get()
        pool.apply_async(func=func, args=(url, queue))

import pymysql
class Db:
    def __init__(self, database='kl', user='root', password='123456', port=3306, host='localhost'):
        self.db = pymysql.connect(database=database, port=port, password=password, user=user, host=host, charset='utf8mb4')
        self.cursor = self.db.cursor()
    def insert(self, sql, data):
        self.cursor.execute(sql, data)
        self.db.commit()
    def update(self, sql, data):
        self.cursor.execute(sql, data)
        self.db.commit()
    def __del__(self):
        self.cursor.close()
        self.db.close()
#### beautifulsoup
from bs4 import BeautifulSoup
text = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title p8" id="p1" data="1"><b>The Dormouse's story</b><b>----2b</b></p>

<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><span><b>Elsie</b></span>--alice</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>

<p class="story">...</p>
<div class="item1">我是div</div>
<div class="item2">我是div2</div>
我是body
"""
# Instantiate the parser
soup = BeautifulSoup(text, 'lxml')
# # Pretty-print the document
# print(soup.prettify())
# Take the first matching element
print(soup.div)
# Tag name
print(soup.div.name)
# Text content of the tag
print(soup.div.string)
# All text inside the tag, including text in child tags
print(soup.body.get_text())
# Read a single attribute
print(soup.p['id'])
# All attributes and values of the current tag
print(soup.p.attrs)

# find_all
print(soup.find_all('div'))

# find
print(soup.find('p'))
# Child and descendant nodes (descendants is a generator, so materialise it)
print(soup.body.contents)
print(list(soup.body.descendants))

# CSS selectors
print(soup.select('p'))
print(soup.select('a.sister'))
res = soup.select('p>b')
print(res)
res = soup.select('#p1,.story')
print(res)
#### scrapy framework
Workflow:
1. cd into the directory where the scrapy project should live
2. Create the project: scrapy startproject <project name>
3. Generate a spider: scrapy genspider <spider name> <domain>  (see the sketch below)
4. Run the spider: scrapy crawl <spider name>
5. Debugging console: scrapy shell
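
For reference, a minimal sketch of the kind of spider that step 3 generates and step 4 runs; the spider name and target site are placeholders chosen for illustration, not taken from the notes above.

import scrapy

class QuotesSpider(scrapy.Spider):
    # `name` is what `scrapy crawl quotes` refers to
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # the response object supports the same css/xpath extraction used earlier
        for quote in response.css('div.quote'):
            yield {'text': quote.css('span.text::text').get()}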
#### selenium
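A minimal sketch of driving a browser with selenium, assuming chromedriver is installed and on PATH; the target URL is only an illustration.

from selenium import webdriver

# Launch a browser, render a page, then read the rendered HTML
driver = webdriver.Chrome()
driver.get('http://www.baidu.com')
print(driver.page_source[:200])
driver.quit()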


#### db class
import pymysql
class MysqlDatabase:
    def __init__(self):
        self.db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, database='kl', charset='utf8mb4')
        self.cursor = self.db.cursor()
    def insert(self, sql, data):
        try:
            self.cursor.execute(sql, data)
            self.db.commit()
        except Exception as e:
            print('insert failed:', e)
    def update(self, sql, data):
        try:
            self.cursor.execute(sql, data)
            self.db.commit()
        except Exception as e:
            print('update failed:', e)
    def __del__(self):
        self.cursor.close()
        self.db.close()
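
A short usage sketch of the class above; the news table and its columns are assumptions made up for the example, not a schema from these notes.

db = MysqlDatabase()
# parameterised SQL: pymysql fills the %s placeholders from the data tuple
sql = 'INSERT INTO news (title, content) VALUES (%s, %s)'
db.insert(sql, ('some title', 'some body text'))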
#### proxy helper class
import requests, time
class ProxyHelper:
    def __init__(self):
        self.url = 'http://mvip.piping.mogumiao.com/proxy/api/get_ip_al?appKey=4015b4545c6545d5b21a3b49fec0671c&count=1&expiryDate=0&format=2&newLine=2'
        self.version = 0
    def get_proxy(self):
        print('fetched a new proxy')
        self.proxy = requests.get(self.url).text.strip()
        return self.proxy, self.version
    def update_proxy(self, version):
        # Only refresh when the caller still holds the current version, so
        # several workers sharing the helper do not all refresh at once
        if version == self.version:
            self.proxy, self.version = self.get_proxy()
            self.version += 1
            print('updated proxy: ' + self.proxy)
        return self.proxy
if __name__ == '__main__':
    helper = ProxyHelper()
    # proxy, version = helper.get_proxy()
    # print(proxy)
    # time.sleep(30)
    helper.update_proxy(0)
    print(helper.update_proxy(0))
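
A sketch of plugging the helper into requests; it assumes the proxy API returns a single plain ip:port line (which may differ for this provider) and wraps it into the proxies dict that requests expects.

import requests

helper = ProxyHelper()
ip_port, version = helper.get_proxy()
proxies = {'http': 'http://' + ip_port, 'https': 'http://' + ip_port}
response = requests.get('http://www.baidu.com', proxies=proxies, timeout=10)
print(response.status_code)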