python scrapy爬取網站數據一
來源:風(fēng)水月
發布時間:2018-07-03 17:24:48
閱讀量:1227
原來寫過一篇scrapy的介紹,說了下scrapy的環境如何配置,該篇博客地址是:win10 python安裝及環境配置、scrapy框架安裝及PyCharm集成
本篇會從一個實際的例子當中記錄scrapy的使用
大家都對三國很熟,下面我們從 三國在線(http://www.e3ol.com/biography-index.html)來獲取三國人物數據,獲取三國人物數據的整體代碼如下,本代碼抓取數據的網址返回的是JSON格式的數據,本代碼將解析該JSON數據,并將其按json的鍵創建數據表,保存人物信息
import json
import re

import pymysql
import scrapy

from sgyyScrapy.items import SgyyscrapyItem
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}


class sgyyScrapy(scrapy.Spider):
    """Fetch character records from e3ol.com and store them in MySQL.

    Each response from the AJAX listing endpoint is a JSON-like payload
    whose keys are not quoted. The payload is repaired, parsed, and every
    entry of its ``soul`` list is written as one row of the
    ``sgyy_person`` table. The table is (re)created from the keys of the
    first stored item, with every column declared ``varchar(1000)``.
    """

    name = "sgyyScrapy"
    # Scrapy reads ``allowed_domains`` and expects bare domain names, not
    # URLs; the previous spelling ``allowed_domins`` was silently ignored.
    allowed_domains = ["e3ol.com"]
    start_urls = []
    # Flipped to True once the table has been dropped and re-created.
    isCreateTable = False

    # Matches either a complete double-quoted string (left untouched) or a
    # bare word immediately followed by a colon (an unquoted key).
    _string_or_key_pat = re.compile(r'"(?:[^"\\]|\\.)*"|(\w+):')

    def start_requests(self):
        """Yield one request per (``a2``, ``pageno``) pair of the endpoint.

        ``a2`` runs from 1 to 14 and ``pageno`` from 1 to 50. Per the
        original author's note, ``a2`` selects the faction a character
        mainly served. Full URL form:
        http://www.e3ol.com/biography/inc_ajax.asp?types=index&a2=%s&pageno=1

        URLs are generated lazily rather than appended to the class-level
        ``start_urls`` list, which is shared by all instances and would
        grow on every call.
        """
        url_template = ('http://www.e3ol.com/biography/inc_ajax.asp'
                        '?types=index&a2=%s&pageno=%s')
        for faction in range(1, 15):
            for page in range(1, 51):
                yield scrapy.Request(url_template % (faction, page),
                                     headers=headers, callback=self.parse)

    def parse(self, response):
        """Parse one listing response and persist every entry under ``soul``.

        Raises:
            KeyError: if the parsed payload has no ``soul`` key.
            json.JSONDecodeError: if the repaired payload is still not
                valid JSON.
        """
        payload = self._load_payload(response.text)

        # NOTE(review): connection settings are hard-coded; consider moving
        # them to the Scrapy settings.
        db = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                             password="zhl", database="sgyy", charset='utf8')
        try:
            cursor = db.cursor()
            try:
                for item in payload['soul']:
                    self._store_item(db, cursor, item)
            finally:
                cursor.close()
        finally:
            # Always release the connection, even if an item raised.
            db.close()
        self.logger.info("結(jié)束")

    def _load_payload(self, raw_text):
        """Turn the raw response text into a parsed JSON object."""
        # Non-ASCII text arrives as \uXXXX escape sequences; decode them
        # into real characters.
        decoded = raw_text.encode('utf-8').decode('unicode_escape')
        # Drop the first and last character; presumably a wrapper around
        # the JSON object - TODO confirm against a live response.
        trimmed = decoded[1:-1]
        compact = trimmed.replace(" ", "")
        self.logger.debug(compact)
        # Keys in the payload are not quoted, which json.loads rejects.
        quoted = self.quote_keys_for_json(compact)
        self.logger.debug(quoted)
        return json.loads(quoted.replace("'", "\""))

    def _store_item(self, db, cursor, item):
        """Insert one record, creating the table first if still needed.

        Column names come from the item's keys. Values are stored as their
        ``str()`` form and passed as query parameters, so quotes inside a
        value cannot break or alter the statement.
        """
        if not item:
            # Nothing to build a column list from.
            return

        # Backtick-quote identifiers so reserved words and odd characters
        # in keys remain valid column names.
        columns = ["`%s`" % str(key).replace("`", "``") for key in item]
        column_list = ", ".join(columns)
        placeholders = ", ".join(["%s"] * len(columns))
        # With parameters, the driver %-formats the query, so a literal
        # percent sign in a column name has to be doubled.
        insert_sql = ("insert into sgyy_person("
                      + column_list.replace("%", "%%")
                      + ") values (" + placeholders + ")")
        values = [str(value) for value in item.values()]

        try:
            if not self.isCreateTable:
                create_sql = ("create table sgyy_person("
                              + ", ".join(col + " varchar(1000)"
                                          for col in columns)
                              + ")")
                self.logger.debug(create_sql)
                cursor.execute("DROP TABLE IF EXISTS sgyy_person")
                cursor.execute(create_sql)
                # Only mark the table ready once creation succeeded, so a
                # failed attempt is retried with the next item.
                self.isCreateTable = True
            self.logger.debug(insert_sql)
            cursor.execute(insert_sql, values)
            db.commit()
        except pymysql.MySQLError:
            # Best effort per item: record the error and continue.
            self.logger.exception("發(fā)生錯(cuò)誤,回滾事務(wù)")
            db.rollback()

    def quote_keys_for_json(self, json_str):
        """Return ``json_str`` with every unquoted key wrapped in double quotes.

        A key is a run of word characters immediately followed by ``:``.
        Text inside double-quoted strings is left untouched, so colons in
        values (for example URLs) are not mistaken for keys.

        Note: for parsing loosely formatted JSON in general, see
        https://github.com/dmeranda/demjson (slower than the standard
        library).
        """
        def quote_key(match):
            bare_key = match.group(1)
            if bare_key is None:
                # A quoted string: keep it exactly as it was.
                return match.group(0)
            return '"%s":' % bare_key

        return self._string_or_key_pat.sub(quote_key, json_str)
原文地址https://blog.csdn.net/fengshuiyue/article/details/80857875