Requests is an elegant and simple HTTP library for Python, built for human beings.

First, make sure you have the Requests library installed (e.g. `pip install requests`), and let's get started.

import requests as R

QuickStart

Let's start by using requests to download a file from the Project Gutenberg website: William Shakespeare - Romeo and Juliet.

res = R.get('http://www.gutenberg.org/cache/epub/1112/pg1112.txt')
type(res)
# requests.models.Response
res.status_code == R.codes.ok  ## 200, the request succeeded
# True
len(res.text)
# 179378
print(res.text[:250])
# The Project Gutenberg EBook of Romeo and Juliet, by William Shakespeare...

When we request a page that does not exist, we may get a 404. We can handle it like this:

import requests
res = requests.get('http://inventwithpython.com/page_that_does_not_exist')
try:
    res.raise_for_status()  ## raises an exception if the request failed
except Exception as exc:
    print('There was a problem: %s' % (exc))
# There was a problem: 404 Client Error: Not Found for url: http://inventwithpython.com/page_that_does_not_exist

Tips: if the site you are visiting requires a proxy, you may encounter errors such as:

TimeoutError: A connection attempt failed because the connected party did not properly respond after a period of time, or the connected host failed to respond.
NewConnectionError: A connection attempt failed because the connected party did not properly respond after a period of time, or the connected host failed to respond.
MaxRetryError: A connection attempt failed because the connected party did not properly respond after a period of time, or the connected host failed to respond.
ConnectionError: A connection attempt failed because the connected party did not properly respond after a period of time, or the connected host failed to respond.
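In that case you can route the request through a proxy with the `proxies` argument. A minimal sketch, assuming a local proxy listening at 127.0.0.1:7890 (a placeholder address; substitute your own proxy):

```python
import requests

# Hypothetical proxy endpoint; replace with your actual proxy address.
proxies = {
    "http": "http://127.0.0.1:7890",
    "https": "http://127.0.0.1:7890",
}
try:
    res = requests.get("http://httpbin.org/get", proxies=proxies, timeout=5)
    print(res.status_code)
except requests.exceptions.RequestException as exc:
    # RequestException is the base class of the timeout/connection errors above.
    print("Proxy request failed: %s" % exc)
```

If the proxy is unreachable, the except branch catches the resulting ConnectionError instead of crashing the script.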

Next we use the standard open() function and write() method to save the file. iter_content() returns a chunk of bytes on each iteration of the loop; here each chunk holds 100,000 bytes, so even a huge file can be downloaded without using much memory.

with open("William Shakespeare - Romeo and Juliet.txt", 'wb') as rj:
    for chunk in res.iter_content(100000):
        rj.write(chunk)
# the with statement closes the file automatically
import json
r = R.get('https://api.github.com/user', auth=('user', 'pass')) ## your auth
r.headers['content-type']
# 'application/json; charset=utf8'
r.encoding
# 'utf-8'
r.json()
# {'login': 'Joaxin',
# ...
# 'private_repos': 10000}}
r.text.encode("utf-8")
# b'{"login":"...#
r.url
# 'https://api.github.com/user'

Make a Request

print(R.get('http://httpbin.org/get').text)
# {
# "args": {},
# "headers": {
# "Accept": "*/*",
# "Accept-Encoding": "gzip, deflate",
# "Connection": "close",
# "Host": "httpbin.org",
# "User-Agent": "python-requests/2.18.4"
# },
# "origin": "115.196.156.112",
# "url": "http://httpbin.org/get"
# }
print(R.post('http://httpbin.org/post', data={'animal': 'cat', 'tags': ['persian', 'friendly']}).text)
# {
# "args": {},
# "data": "",
# "files": {},
# "form": {
# "animal": "cat",
# "tags": [
# "persian",
# "friendly"
# ]
# },
# ...
print(R.delete('https://httpbin.org/delete'))
# <Response [200]>

Parameters in URLs

payload = {
    'song1': 'Into the Well',
    'song2': 'Lift Me Up'
}
print(R.get("http://httpbin.org/get", params=payload))
# <Response [200]>
print(R.get("http://httpbin.org/get", params=payload).text)  # str
# {
# "args": {
# "song1": "Into the Well",
# "song2": "Lift Me Up"
# },
# "headers": {
# ...
# }
print(R.post("http://httpbin.org/post", params=payload, data=payload).text) # str
# {
# "args": {
# "song1": "Into the Well",
# "song2": "Lift Me Up"
# },
# "data": "",
# "files": {},
# "form": {
# "song1": "Into the Well",
# "song2": "Lift Me Up"
# },
# ...
# }
print(R.get('http://httpbin.org/get?song1=Into+the+Well&song2=Lift+Me+Up&song2=Goodnight+%26+Goodbye').json())
# {'args': {'song1': 'Into the Well', 'song2': ['Lift Me Up', 'Goodnight & Goodbye']}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate',\
# 'Connection': 'close', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.18.4'},\
# 'origin': '115.196.156.112', 'url': 'http://httpbin.org/get?song1=Into+the+Well&song2=Lift+Me+Up&song2=Goodnight+%26+Goodbye'}
print(R.post("http://httpbin.org/post", params=payload, data=json.dumps(payload)).text)
# {
# "args": {
# "song1": "Into the Well",
# "song2": "Lift Me Up"
# },
# "data": "{\"song1\": \"Into the Well\", \"song2\": \"Lift Me Up\"}",
# "files": {},
# "form": {},
# "headers": {
# "Accept": "*/*",
# "Accept-Encoding": "gzip, deflate",
# "Content-Length": "49",
# "Host": "httpbin.org",
# "User-Agent": "python-requests/2.22.0"
# },
# "json": {
# "song1": "Into the Well",
# "song2": "Lift Me Up"
# },
# ...
# }

Grabbing Data

Cookies

Bing

req = R.get('https://www.bing.com/search?q=bing', timeout=5)
print(req.history)
# []

for key, value in req.cookies.items():  ## RequestsCookieJar
    print(key + '=' + value)
# MUID=05319260E985628908A499CFE832631F
# SRCHD=AF=NOFORM
# ...

Zhihu Example

Zhihu

import re
headers = {
    'Cookie': '...',
    'origin': '...',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
    'referer': '...',
}
req = R.get("https://www.zhihu.com/explore", headers=headers)
if req.status_code != R.codes.ok:
    exit()
print('Request Successfully')

pattern = re.compile('<a.*?question_link.*?>(.*?)</a>', re.S)
titles = re.findall(pattern, req.text)
print(titles)

pattern2 = re.compile('<a.*?zu-top-nav-userinfo.*?<span.*?>(.*?)</span>.*?Avatar', re.S)
username = re.findall(pattern2, req.text)
print(username)
Request Successfully
['\n为什么扁桃体发炎会引起耳鸣和鼻炎?\n', '\n对女性来说,健身有可能是把毁容刀么?是否有哪些不适合女性健身的动作?\n', '\n有哪些有分量的动画奖项?\n', '\n如何看待美团开源的 mpvue ?\n', '\n有哪些作文素材或名言是你百试不爽的?\n', '\n有哪些名字配不上本尊的植物/动物吗?\n', '\n自然吸气发动机的线性是指什么和什么呈线性?\n', '\n有哪些眼前一亮的暗中称妙的成语?\n', '\n为了高考你有多拼命?\n', '\n中国有哪些值得拍成影视剧的历史或人物?\n', '\n乌克兰是不是真的很穷?\n', '\n在p社游戏过程中,有哪些让你对现实产生思考的时候?\n', '\n有哪些相见恨晚的TensorFlow小技巧?\n']
['Lyole']

RequestsCookieJar

cookies = headers['Cookie']
jar = R.cookies.RequestsCookieJar()
headers2 = {
    'Host': 'www.zhihu.com',
    'User-Agent': headers["User-Agent"]
}
for cookie in cookies.split(';'):
    key, value = cookie.split('=', 1)
    jar.set(key, value)
req = R.get("https://www.zhihu.com/explore", cookies=jar, headers=headers2)
username = re.findall(pattern2, req.text)
print(username)
['Lyole']

Session

from requests import Request, Session
R.get('http://httpbin.org/cookies/set/number/1995')
req = R.get('http://httpbin.org/cookies')
print(req.text)
# {
# "cookies": {}
# }

s = Session()
s.get('http://httpbin.org/cookies/set/number/1995')
req = s.get('http://httpbin.org/cookies')
print(req.text)
# {
# "cookies": {
# "number": "1995"
# }
# }
url = 'http://httpbin.org/post'
data = {
    'Saycet': '15'
}
headers3 = {
    'User-Agent': headers["User-Agent"]
}

s = Session()
req = Request('POST', url, data=data, headers=headers3)
prepped = s.prepare_request(req)
res = s.send(prepped)
print(res.text)
# {
# "args": {},
# "data": "",
# "files": {},
# "form": {
# "Saycet": "15"
# },
# "headers": {
# "Accept": "*/*",
# "Accept-Encoding": "gzip, deflate",
# "Connection": "close",
# "Content-Length": "9",
# "Content-Type": "application/x-www-form-urlencoded",
# "Host": "httpbin.org",
# "User-Agent": "User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 YaBrowser/18.2.0.284Yowser/2.5 Safari/537.36"
# },
# "json": null,
# "origin": "183.246.20.118",
# "url": "http://httpbin.org/post"
# }

SSL

Note: 12306 finally supports HTTPS.

response = R.get('https://www.12306.cn')
print(response.status_code)
# SSLError: HTTPSConnectionPool
response = R.get('https://www.12306.cn', verify=False)
print(response.status_code)
# 200
# Warning: Adding certificate verification is strongly advised.
import logging
logging.captureWarnings(True)
response = R.get('https://www.12306.cn', verify=False)
# response = requests.get('https://www.12306.cn', cert=('/path/server.crt', '/path/key'))
print(response.status_code)
# 200

Files

req = R.get("http://httpbin.org/image/png")
# print(req.text)
# print(req.content)
with open('pig.png', 'wb') as f:
    f.write(req.content)

# upload data
with open('pig.png', 'rb') as f:
    req = R.post("http://httpbin.org/post", files={'file': f})
# print(req.text)

OAuth

from requests_oauthlib import OAuth1

url = 'https://api.twitter.com/1.1/account/verify_credentials.json'
auth = OAuth1('YOUR_APP_KEY', 'YOUR_APP_SECRET',
              'USER_OAUTH_TOKEN', 'USER_OAUTH_TOKEN_SECRET')
R.get(url, auth=auth)

Status Codes

For reference, these are the name-to-code aliases defined in requests.status_codes._codes (each name is also available as an attribute of requests.codes):

_codes = {

    # Informational.
    100: ('continue',),
    101: ('switching_protocols',),
    102: ('processing',),
    103: ('checkpoint',),
    122: ('uri_too_long', 'request_uri_too_long'),
    200: ('ok', 'okay', 'all_ok', 'all_okay', 'all_good', '\\o/', '✓'),
    201: ('created',),
    202: ('accepted',),
    203: ('non_authoritative_info', 'non_authoritative_information'),
    204: ('no_content',),
    205: ('reset_content', 'reset'),
    206: ('partial_content', 'partial'),
    207: ('multi_status', 'multiple_status', 'multi_stati', 'multiple_stati'),
    208: ('already_reported',),
    226: ('im_used',),

    # Redirection.
    300: ('multiple_choices',),
    301: ('moved_permanently', 'moved', '\\o-'),
    302: ('found',),
    303: ('see_other', 'other'),
    304: ('not_modified',),
    305: ('use_proxy',),
    306: ('switch_proxy',),
    307: ('temporary_redirect', 'temporary_moved', 'temporary'),
    308: ('permanent_redirect',
          'resume_incomplete', 'resume',),  # These 2 to be removed in 3.0

    # Client Error.
    400: ('bad_request', 'bad'),
    401: ('unauthorized',),
    402: ('payment_required', 'payment'),
    403: ('forbidden',),
    404: ('not_found', '-o-'),
    405: ('method_not_allowed', 'not_allowed'),
    406: ('not_acceptable',),
    407: ('proxy_authentication_required', 'proxy_auth', 'proxy_authentication'),
    408: ('request_timeout', 'timeout'),
    409: ('conflict',),
    410: ('gone',),
    411: ('length_required',),
    412: ('precondition_failed', 'precondition'),
    413: ('request_entity_too_large',),
    414: ('request_uri_too_large',),
    415: ('unsupported_media_type', 'unsupported_media', 'media_type'),
    416: ('requested_range_not_satisfiable', 'requested_range', 'range_not_satisfiable'),
    417: ('expectation_failed',),
    418: ('im_a_teapot', 'teapot', 'i_am_a_teapot'),
    421: ('misdirected_request',),
    422: ('unprocessable_entity', 'unprocessable'),
    423: ('locked',),
    424: ('failed_dependency', 'dependency'),
    425: ('unordered_collection', 'unordered'),
    426: ('upgrade_required', 'upgrade'),
    428: ('precondition_required', 'precondition'),
    429: ('too_many_requests', 'too_many'),
    431: ('header_fields_too_large', 'fields_too_large'),
    444: ('no_response', 'none'),
    449: ('retry_with', 'retry'),
    450: ('blocked_by_windows_parental_controls', 'parental_controls'),
    451: ('unavailable_for_legal_reasons', 'legal_reasons'),
    499: ('client_closed_request',),

    # Server Error.
    500: ('internal_server_error', 'server_error', '/o\\', '✗'),
    501: ('not_implemented',),
    502: ('bad_gateway',),
    503: ('service_unavailable', 'unavailable'),
    504: ('gateway_timeout',),
    505: ('http_version_not_supported', 'http_version'),
    506: ('variant_also_negotiates',),
    507: ('insufficient_storage',),
    509: ('bandwidth_limit_exceeded', 'bandwidth'),
    510: ('not_extended',),
    511: ('network_authentication_required', 'network_auth', 'network_authentication'),
}
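These aliases are exposed on the requests.codes lookup object, so status checks can be written by name rather than by magic number:

```python
import requests

# Each alias maps to its numeric status code.
print(requests.codes.ok)            # 200
print(requests.codes.not_found)     # 404
print(requests.codes.teapot)        # 418
# Dictionary-style lookup works as well.
print(requests.codes['forbidden'])  # 403
```

This is what makes comparisons like `res.status_code == requests.codes.ok` earlier in this post work.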

PyQuery

A jQuery-like library for Python.

GitHub: https://github.com/gawel/pyquery

OK, let's try downloading some data from https://www.zhihu.com/explore

Note: this code is now out of date.

from pyquery import PyQuery as pq

headers = {
    'Cookie': '...',
    'Host': 'www.zhihu.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 YaBrowser/18.2.0.284Yowser/2.5 Safari/537.36'
}
req = R.get("https://www.zhihu.com/explore", headers=headers).text
doc = pq(req)
items = doc('.explore-tab .feed-item').items()
for item in items:
    question = item.find('h2').text()
    print(question)
    author = item.find('.author-link-line').text()
    answer = pq(item.find('textarea').html()).text()
    with open('Zhihu.txt', 'a', encoding='utf-8') as file:
        file.write('\n'.join([question, author, answer]))
        file.write('\n' + '-' * 100 + '\n')

女生讨厌或不欣赏女生哪些行为?
作为医生看见面前一个暂时活蹦乱跳的喝了百草枯的病人是怎么的一种感觉?
如何看待外媒称日本以研修生名义,骗越南人到福岛清理核垃圾?
为什么大家肯定话剧演员的演技,但是越来越少的人去看话剧?85后为什么不愿意去剧院看话剧?
能不能推荐一些适合做壁纸的名画?
有哪些眼前一亮的暗中称妙的成语?
为了高考你有多拼命?
中国有哪些值得拍成影视剧的历史或人物?
乌克兰是不是真的很穷?
在p社游戏过程中,有哪些让你对现实产生思考的时候?
