In the help for urllib.getproxies() I found "It scans the environment for variables named ..."
%load "C:\\Users\\kiss\\Anaconda\\Lib\\site-packages\\scrapy\\contrib\\downloadermiddleware\\httpproxy.py"
import base64
from urllib import getproxies, unquote, proxy_bypass
from urllib2 import _parse_proxy
from urlparse import urlunparse
from scrapy.utils.httpobj import urlparse_cached
from scrapy.exceptions import NotConfigured
class HttpProxyMiddleware(object):

    def __init__(self):
        self.proxies = {}
        for type, url in getproxies().items():
            self.proxies[type] = self._get_proxy(url, type)

        if not self.proxies:
            raise NotConfigured

    def _get_proxy(self, url, orig_type):
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user and password:
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.b64encode(user_pass).strip()
        else:
            creds = None

        return creds, proxy_url

    def process_request(self, request, spider):
        # ignore if proxy is already seted
        if 'proxy' in request.meta:
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return

        if scheme in self.proxies:
            self._set_proxy(request, scheme)

    def _set_proxy(self, request, scheme):
        creds, proxy = self.proxies[scheme]
        request.meta['proxy'] = proxy
        if creds:
            request.headers['Proxy-Authorization'] = 'Basic ' + creds
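To make the middleware's contract concrete, here is a minimal sketch (the URLs and the proxy address are made up) of how process_request() treats requests: a request that already carries a 'proxy' key in its meta is left untouched, any other request gets the proxy chosen for its scheme in __init__().

from scrapy.http import Request

# a request that sets its proxy explicitly -- the middleware returns immediately
r1 = Request('http://example.com/page')
r1.meta['proxy'] = 'http://10.0.0.1:3128'

# a request without a 'proxy' key -- the middleware fills request.meta['proxy']
# (and, if credentials were given, the Proxy-Authorization header) from the
# http_proxy / https_proxy environment variables collected in __init__()
r2 = Request('http://example.com/other')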
Next, let's go through what we import from urllib, urllib2 ... These libraries obviously change a lot: in Python 3 they have been replaced by other modules (see the link below), and I could no longer find proxy_bypass or _parse_proxy in the current documentation.
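For reference, a sketch of where the same names appear to live after the Python 3 stdlib reorganization (this is my assumption from browsing the urllib package; _parse_proxy stays a private, undocumented helper):

# Python 3 equivalents (assumption; _parse_proxy and proxy_bypass are not part
# of the documented public API):
# from urllib.request import getproxies, proxy_bypass, _parse_proxy
# from urllib.parse import unquote, urlunparse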
import urllib
urllib.getproxies()
This helper function returns a dictionary of scheme to proxy server URL mappings.
It scans the environment for variables named <scheme>_proxy, in case insensitive way, for all operating systems first,
and when it cannot find it, looks for proxy information from Mac OSX System Configuration for Mac OS X and
Windows Systems Registry for Windows.
help(urllib.getproxies)
urllib.getproxies()
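A quick sketch of what getproxies() returns when a proxy is configured through the environment (the address below is made up):

import os
import urllib

os.environ['http_proxy'] = 'http://10.0.0.1:3128'   # hypothetical proxy
print urllib.getproxies()                            # e.g. {'http': 'http://10.0.0.1:3128'}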
What could be simpler than "scanning" the environment variables; but how does one get at the system settings? Here is a fragment from urllib:
def getproxies_environment():
    """The docstring is the help text already printed above, so it is omitted here."""
    proxies = {}
    for name, value in os.environ.items():   # os is imported at the top of urllib
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies
Later it would be worth combining the command-line facilities (!set ...) with os.environ, for example for switching the proxy manually.
from os import environ
environ.items()
help(environ)
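As a rough sketch of that idea (the addresses are made up and the helper names are mine): since getproxies_environment() re-reads os.environ on every call, changing the variables from Python has the same effect as !set http_proxy=... in the shell.

import os

def use_proxy(url):
    # point both schemes at the given proxy
    os.environ['http_proxy'] = url
    os.environ['https_proxy'] = url

def drop_proxy():
    # remove the variables so getproxies() stops reporting a proxy
    for name in ('http_proxy', 'https_proxy'):
        os.environ.pop(name, None)

use_proxy('http://10.0.0.2:8080')
# ... do some proxied work ...
drop_proxy()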
FancyURLopener subclasses URLopener
providing default handling for the following HTTP response codes: 301, 302, 303, 307 and 401.
For the 30x response codes listed above, the Location header is used to fetch the actual URL.
For 401 response codes (authentication required), basic HTTP authentication is performed.
For the 30x response codes, recursion is bounded by the value of the maxtries attribute, which defaults to 10.
For all other response codes, the method http_error_default() is called which you can override in subclasses to handle
the error appropriately.
Note: According to the letter of RFC 2616, 301 and 302 responses to POST requests must not be automatically redirected
without confirmation by the user. In reality, browsers do allow automatic redirection of these responses, changing
the POST to a GET, and urllib reproduces this behaviour.
The parameters to the constructor are the same as those for URLopener.
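A minimal sketch of the override the excerpt mentions (the class name and message are made up): handle "all other response codes" by reporting them instead of letting http_error_default() raise.

import urllib

class ReportingOpener(urllib.FancyURLopener):
    # called for response codes other than 301/302/303/307/401
    def http_error_default(self, url, fp, errcode, errmsg, headers):
        print 'unhandled HTTP status %s (%s) for %s' % (errcode, errmsg, url)
        return fp

# opener = ReportingOpener()
# page = opener.open('http://example.com/may-not-exist')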
urllib.unquote(string)
Replace %xx escapes by their single-character equivalent.
Example: unquote('/%7Econnolly/') yields '/~connolly/'.
print urllib.unquote('/%7Econnolly/')
help(urllib.proxy_bypass)
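A small check of proxy_bypass() (the host name is arbitrary): it returns a true value when the given host should not go through the proxy, based on the no_proxy variable or, on Windows, the registry's proxy-override list.

print urllib.proxy_bypass('localhost')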
import urllib2
help(urllib2._parse_proxy)
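What _parse_proxy() does is easiest to see from a quick call (the URL is made up): it splits a proxy specification into (scheme, user, password, host:port), which is exactly what _get_proxy() above consumes.

print urllib2._parse_proxy('http://user:secret@10.0.0.1:3128')
# ('http', 'user', 'secret', '10.0.0.1:3128')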
import urlparse
from scrapy.utils.httpobj import urlparse_cached
help(urlparse_cached)
help(urlparse.urlunparse)
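To round off these two helpers, a short sketch (the URL and proxy host are made up): urlparse_cached() parses request.url once and caches the result on the request object, while urlunparse() rebuilds a bare proxy URL from its six components the way _get_proxy() does.

import urlparse
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached

req = Request('http://example.com/path?q=1')
parsed = urlparse_cached(req)           # cached urlparse result for req.url
print parsed.scheme, parsed.hostname    # http example.com

print urlparse.urlunparse(('http', '10.0.0.1:3128', '', '', '', ''))
# http://10.0.0.1:3128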
from scrapy.exceptions import NotConfigured
help(NotConfigured)
[Hypertext Transfer Protocol -- HTTP/1.1 rfc2616](http://tools.ietf.org/html/rfc2616.html)
10.2 Successful 2xx
    10.2.1 200 OK
    10.2.2 201 Created
    10.2.3 202 Accepted
    10.2.4 203 Non-Authoritative Information
    10.2.5 204 No Content
    10.2.6 205 Reset Content
    10.2.7 206 Partial Content
10.3 Redirection 3xx
    10.3.1 300 Multiple Choices
    10.3.2 301 Moved Permanently
    10.3.3 302 Found
    10.3.4 303 See Other
    10.3.5 304 Not Modified
    10.3.6 305 Use Proxy
    10.3.7 306 (Unused)
    10.3.8 307 Temporary Redirect
10.4 Client Error 4xx
    10.4.1 400 Bad Request
    10.4.2 401 Unauthorized
    10.4.3 402 Payment Required
    10.4.4 403 Forbidden
    10.4.5 404 Not Found
    10.4.6 405 Method Not Allowed
    10.4.7 406 Not Acceptable
    10.4.8 407 Proxy Authentication Required
    10.4.9 408 Request Timeout
    10.4.10 409 Conflict
    10.4.11 410 Gone
    10.4.12 411 Length Required
    10.4.13 412 Precondition Failed
    10.4.14 413 Request Entity Too Large
    10.4.15 414 Request-URI Too Long
    10.4.16 415 Unsupported Media Type
    10.4.17 416 Requested Range Not Satisfiable
    10.4.18 417 Expectation Failed
10.5 Server Error 5xx
    10.5.1 500 Internal Server Error
    10.5.2 501 Not Implemented
    10.5.3 502 Bad Gateway
    10.5.4 503 Service Unavailable
    10.5.5 504 Gateway Timeout
    10.5.6 505 HTTP Version Not Supported