1. Current problem: running this .py file produces no output or reaction at all. What is going wrong?
2. I'm a complete beginner. I've never touched Python and have only learned basic HTML and CSS. A company project requires collecting supplier information for a region, so I thought of a Python crawler and found sample source code for scraping Alibaba on a geek's website.
3. Following the tutorial, I installed Python 2.7.13, pip, selenium, and the Firefox browser...
4. I contacted the author, who said I only need to change the Taobao account name, password, and the search page URL. Still nothing happens. I'm using the author's source code from his Git repository.
5. Does it just take a very long time to run, or has something gone wrong? There is little online about this kind of problem, so I'm asking here.
6. The source code is as follows:
#! /usr/bin/env python
# coding:utf-8
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import time
import urllib
import urllib2
import sys
import os
import re
import csv
import numpy as np
# Work around encoding errors with Chinese text (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')
# Open a Firefox browser
driver = webdriver.Firefox()
# Sleep 3 seconds so the browser is fully open before doing anything else
time.sleep(3)
# URL of the supplier search results page
url = 'https://s.1688.com/company/company_search.htm?' \
'keywords=%BC%E0%BF%D8&city=%C9%EE%DB%DA&province=%B9%E3%B6%AB&n=y&filt=y'
# Login URL
login_url = 'https://login.1688.com/member/signin.htm?'
# Navigate to the login page
driver.get(login_url)
# Sleep 5 seconds in case a slow connection hasn't finished loading the page
time.sleep(5)
# Find the username input by its name attribute and type the account name
driver.find_element_by_name("TPL_username").send_keys('')
# Find the password input and type the password
driver.find_element_by_name("TPL_password").send_keys('')
# Submit the login form by pressing Enter in the password field
driver.find_element_by_name("TPL_password").send_keys(Keys.ENTER)
# Sleep 5 seconds so the login completes before moving on
time.sleep(5)
# Navigate to the supplier search results page
driver.get(url)
# Create data.csv and write the scraped data into it
csvfile = file('data.csv', 'wb')
writer = csv.writer(csvfile)
# Write the header row: company name, homepage, products, contact, phone, and address
writer.writerow((
u'企業(yè)名稱'.encode('gbk'),
u'主頁'.encode('gbk'),
u'產(chǎn)品'.encode('gbk'),
u'聯(lián)系人'.encode('gbk'),
u'電話'.encode('gbk'),
u'地址'.encode('gbk')
))
# Build a pool of User-Agent strings to make the requests look less like a bot
user_agents = [
'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
'Opera/9.25 (Windows NT 5.1; U; en)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1;.NET CLR 1.1.4322; .NET CLR2.0.50727)',
'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5(like Gecko) (Kubuntu)',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
"Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
]
# There are about 100 result pages; crawl them with a for loop (xrange(1, 100) covers pages 1-99)
for page in xrange(1, 100):
    # Catch exceptions so one bad page doesn't abort the whole run
    try:
        # Grab the list of company-name links
        title = driver.find_elements_by_css_selector("a[class=list-item-title-text]")
        # Grab the product descriptions
        product = driver.find_elements_by_xpath("//p[@class=\"list-item-detail\"]/p[1]/p[1]/a[1]")
        # Print the count for debugging
        print len(title)
        # Regex for each merchant's contact-description block
        pattern = re.compile('<p class="contcat-desc".*?>(.*?)</p>', re.S)
        # Regex for the phone number
        tel_pattern = re.compile('<dd>(.*?)</dd>', re.S)
        # Regex for the contact (member) name
        member_name_pattern = re.compile('<a.*?class="membername".*?>(.*?)</a>', re.S)
        # Regex for the address
        address_pattern = re.compile('"address">(.*?)</dd>', re.S)
        for i in xrange(len(title)):
            # Get the company name from the link's title attribute
            title_value = title[i].get_attribute('title')
            # Build the URL of the merchant's contact-info page
            href_value = title[i].get_attribute('href') + 'page/contactinfo.htm'
            # Get the business scope (products)
            product_value = product[i].text
            # Pick a random User-Agent for this request
            agent = np.random.choice(user_agents)
            # Build the request headers
            headers = {'User-Agent': agent, 'Accept': '*/*', 'Referer': 'http://www.google.com'}
            # Build the urllib2 Request
            request = urllib2.Request(href_value, headers=headers)
            # Fetch the contact-info page
            response = urllib2.urlopen(request)
            # Read the HTML source
            html = response.read()
            # Match the contact-description block
            info = re.findall(pattern, html)
            try:
                info = info[0]
            except Exception, e:
                continue
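            # Pull the phone number out of the contact block; skip this listing if none is found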
            tel = re.findall(tel_pattern, info)
            try:
                tel = tel[0]
                tel = tel.strip()
                tel = tel.replace(' ', '-')
            except Exception, e:
                continue
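            # Extract the contact (member) name from the full page HTML; skip the listing if absent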
            member_name = re.findall(member_name_pattern, html)
            try:
                member_name = member_name[0]
                member_name = member_name.strip()
            except Exception, e:
                continue
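            # Extract the address, falling back to an empty string if it is missing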
            address = re.findall(address_pattern, html)
            try:
                address = address[0]
                address = address.strip()
            except Exception, e:
                address = ''
            # Print the values so we can watch the crawl's progress
            print 'tel:' + tel
            print 'member_name:' + member_name
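            # Assemble one row and write it to the CSV (title and product are encoded to GBK to match the header)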
            data = (
                title_value.encode('gbk', 'ignore'),
                title[i].get_attribute('href'),
                product_value.encode('gbk', 'ignore'),
                member_name,
                tel,
                address
            )
            writer.writerow(data)
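        # Scroll to the bottom of the page so the next-page link is in view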
        js = 'var q=document.documentElement.scrollTop=30000'
        driver.execute_script(js)
        time.sleep(1)
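        # Find and click the next-page link, then give the new results page time to load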
        page = driver.find_elements_by_css_selector("a[class=page-next]")
        page = page[0]
        page.click()
        time.sleep(2)
    except Exception, e:
        print 'error:', e
        continue
# Close the csv file
csvfile.close()
# Close the automated browser
driver.close()
Reply from 閉關(guān)修行中:
Prerequisite: after installing Python, configure the PATH environment variable so that you can run the python command from the cmd prompt.
Then cd into the directory containing the .py file and run:
python -u alibaba.py
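If nothing visible happens even when run from cmd, it helps to separate environment problems from script problems. Below is a minimal smoke test (the file name test_env.py is just a placeholder); save it and run it the same way with python -u test_env.py. If this fails, the problem is the Python/selenium/Firefox setup rather than the crawler itself:

#! /usr/bin/env python
# coding:utf-8
# Minimal check that selenium can drive Firefox at all
from selenium import webdriver

driver = webdriver.Firefox()  # should open a visible Firefox window
driver.get('https://www.1688.com')  # load a page to prove the driver works
print driver.title  # print the page title as proof of life
driver.quit()  # close the browser again

Run from cmd this way, the console stays attached to the script, so any traceback (a SyntaxError from a bad paste, a selenium/Firefox startup failure, and so on) is printed instead of the window closing silently.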