# selenium爬取企查查案例 (Selenium example: scraping company data from Qichacha)

# Look up a company on qichacha.com with Selenium, copy the browser cookies
# into a `requests` call, then print three fields from every row of the
# result table.
#
# This section uses names it does not define: `Options`, `webdriver`,
# `requests` and `company_name` must already be in scope.

# Run Chrome without a visible window.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Driver path. A raw string keeps the Windows backslashes from being parsed
# as (invalid) escape sequences; the resulting value is unchanged.
path = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
url = 'https://www.qichacha.com/'
# Create the browser object exactly once. A second bare `webdriver.Chrome()`
# would orphan this configured instance and silently drop the headless
# options and the explicit driver path.
driver = webdriver.Chrome(executable_path=path, chrome_options=chrome_options)
try:
    driver.get(url)
    # Type the company name into the search box, submit the search, then
    # click the first link with class "ma_h1".
    driver.find_element_by_xpath('//input[@id="searchkey"]').send_keys(f'{company_name}')
    driver.find_element_by_xpath("//input[@value ='查一下']").click()
    driver.find_element_by_xpath("//a[@class ='ma_h1']").click()

    # Flatten the browser cookies into a single "name=value;..." header value
    # so the same session can be reused outside the browser.
    cookies = driver.get_cookies()
    cookies_list = [
        cookie_dict['name'] + '=' + cookie_dict['value']
        for cookie_dict in cookies
    ]
    header_cookie = ';'.join(cookies_list)
    print(header_cookie)

    headers2 = {
        'cookie': header_cookie,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    # NOTE: the URL is read before the window switch below, so it is the URL
    # of whichever tab is current at this point.
    fin_url = driver.current_url
    response = requests.get(fin_url, headers=headers2)
    # print(response.text)

    # Handle of the main page; every browser tab has its own handle.
    mainhandle = driver.current_window_handle
    handles = driver.window_handles
    # Walk the tab handles and switch to the one that is not the main page.
    # With only two tabs open this is effectively not a real loop.
    for handle in handles:
        if handle != mainhandle:
            # `switch_to.window` replaces the deprecated `switch_to_window`.
            driver.switch_to.window(handle)

    # Fetch the data. Only the element lookup and text read are guarded, so a
    # missing table is reported without hiding unrelated errors in the
    # parsing code below.
    try:
        raw = driver.find_element_by_xpath("//table[@class='ntable']")
        table_text = raw.text
    except Exception:
        print("无该数据")
    else:
        print(table_text)
        for data1 in table_text.split('\n'):
            # Rows containing '-' are skipped (presumably '-' marks a missing
            # value -- verify against the page).
            if '-' in data1:
                continue
            fields = data1.split(' ')
            # Skip rows that are too short to hold the indexed columns
            # instead of aborting the whole table on the first one.
            if len(fields) < 4:
                continue
            # Circulating market value
            ltsz = fields[3]
            # Price-to-earnings ratio
            syl = fields[1]
            # Price-to-book ratio
            # NOTE(review): this reads the same column (index 3) as `ltsz`
            # above; one of the two indices is probably wrong -- confirm
            # against the actual table layout.
            sjl = fields[3]
            print(ltsz, syl, sjl)
finally:
    # Always shut the browser down so no Chrome/chromedriver process is left
    # running after the script ends, even when a step above fails.
    driver.quit()