最近用 selenium 写了一个小爬虫,需要循环在搜索框内输入内容,然后模拟点击搜索,在跳转后的新页面获取数据,之后重复之前的步骤,直到搜索结束。爬虫刚开始运行的时候速度还可以,大约一秒一个页面;但随着运行时间变长,速度越来越慢。
这里贴出代码:
# Scrape qcc.com search results for every entry in companyNames.
#
# Flow: open a Chrome window so the user can log in by hand, capture the
# session cookies, restart Chrome with chrome_options, replay the cookies,
# then search each company and pass the first hit's link to getTaxpayerNumber.
#
# Relies on names defined elsewhere in the file: webdriver, chrome_options,
# companyNames, getCompanyNames, taxpayerNumber, getTaxpayerNumber, tqdm,
# etree, WebDriverWait, EC, By, random and sleep.

# Raw string so the backslashes in the Windows path are taken literally.
CHROMEDRIVER_PATH = r"D:\GeckoDriver\chromedriver"

# Absolute path to the first result link on the search page
# (written without tbody).
FIRST_HIT_XPATH = (
    '/html/body/div[1]/div[2]/div[2]/div[3]/div/div[2]/div'
    '/table/tr[1]/td[3]/div/a[1]'
)

# --- Step 1: manual login in a first browser session ----------------------
browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
browser.get("https://www.qcc.com/")
# Set the browser window size.
browser.maximize_window()
login = browser.find_element(By.XPATH, '/html/body/header/div/ul/li[10]/a')
login.click()
# Block until the user confirms that the manual login is finished.
input("登录后请按y")
cookies = browser.get_cookies()
browser.quit()

# --- Step 2: second session configured with chrome_options ----------------
browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=chrome_options)
# Load the site, replay the captured cookies, then load it again.
browser.get("https://www.qcc.com/")
for cookie in cookies:
    browser.add_cookie(cookie)
browser.get("https://www.qcc.com/")
browser.maximize_window()

# --- Step 3: one throw-away search to reach the results page --------------
qccinput = browser.find_element(By.CSS_SELECTOR, "#searchkey")
# random.choice always yields a valid element; the previous
# randint(0, len(companyNames)) includes its upper bound and could index one
# past the end of the sequence.
qccinput.send_keys(random.choice(companyNames))
qccbutton = browser.find_element(By.CSS_SELECTOR, ".index-searchbtn")
sleep(0.5)
qccbutton.click()
qccbutton = browser.find_element(By.CSS_SELECTOR, ".input-group-btn")
sleep(0.5)
qccbutton.click()

# --- Step 4: search every company and collect the first hit ---------------
pbar = tqdm(companyNames)
for companyName in pbar:
    browser.forward()
    # Save time: continue as soon as the search box is present on the page.
    # The wait returns the element, so no second lookup is needed.
    seach = WebDriverWait(browser, 15, 0.5).until(
        EC.presence_of_element_located((By.ID, "searchKey"))
    )
    seach.clear()
    seach.send_keys(companyName)
    seachButton = browser.find_element(By.CSS_SELECTOR, ".btn-primary")
    seachButton.click()

    html = etree.HTML(browser.page_source)
    cookies = browser.get_cookies()

    # Remember where this iteration starts writing so a failure can be
    # recorded as exactly one placeholder entry.
    names_before = len(getCompanyNames)
    try:
        found_name = html.xpath("normalize-space(" + FIRST_HIT_XPATH + ")")
        urls = html.xpath(FIRST_HIT_XPATH + "/@href")
        getCompanyNames.append(found_name)
        getTaxpayerNumber(urls, cookies)
    except Exception:
        # Best-effort scrape: any failure for this company is recorded as
        # "not found" and the loop moves on. The name may already have been
        # appended before getTaxpayerNumber raised; drop it so the result
        # lists keep one entry per company.
        # NOTE(review): assumes getTaxpayerNumber does not itself append to
        # taxpayerNumber before raising -- confirm against its definition.
        del getCompanyNames[names_before:]
        getCompanyNames.append("无法搜索到公司")
        taxpayerNumber.append("无法查到税号")

    # Disabled earlier experiment: recreating the browser or clearing the
    # cache through the chromium send_command endpoint
    # (cmd='Network.clearBrowserCache').

    # Reset the session to the cookie snapshot taken after this search.
    # NOTE(review): the original paste lost its indentation; this reset is
    # assumed to run once per loop iteration -- confirm.
    browser.delete_all_cookies()
    for cookie in cookies:
        browser.add_cookie(cookie)