logo
Loading...

asyncio, aiohttp 在 fetch函數中插入迴圈 - Cupoy

我想要用產品條碼(UPC)查詢是什麼商品,為了避免被ban,我在加入了proxy的轉換。code 如...

pycrawler

asyncio, aiohttp 在 fetch函數中插入迴圈

2020/04/05 03:05 上午
Python網路爬蟲討論版
Chi-Kang Su
觀看數:3
回答數:2
收藏數:1
pycrawler

我想要用產品條碼(UPC)查詢是什麼商品,為了避免被ban,我在加入了proxy的轉換。code 如下:


## Get proxies from free proxy website
def catchProxyList():
    proxy_ips = []
    res = requests.get(r'https://free-proxy-list.net/', verify = False)
    soup = BeautifulSoup(res.text)

    for tr_item in soup.find('tbody').find_all('tr'):
        ip = ':'.join([tr_item.find_all('td')[0].text, tr_item.find_all('td')[1].text])
        proxy_ips.append(ip)
    return proxy_ips
proxy_ips = catchProxyList()
proxy_ips[:5]
## There are many dead proxies in the proxy list. We need to check validation on it.
async def checkProxyValid(proxy, session):
    try:
        resp = session.get('http://icanhazip.com',
                            proxies={'http': 'http://jude:xxxxx@{}'.format(proxy),
                                  'https': 'http://jude:xxxxx@{}'.format(proxy)}, 
                            timeout=3)
        if re.findall('\d*.\d*.\d*.\d*', resp.text)[0] == re.findall('\d*.\d*.\d*.\d*', proxy)[0]:
            return True
        else:
            return False
    except:
        return False
        
     
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}

async def crawlPrice(upc, session, headers = headers):
    global proxy_ips
    
    fromData = {
        'method': 'upc',
        'sku': '',
        'upc': upc,
        'zip': '',
        'sort': 'recommended'
    }
    
    # Find a valid IP
    if len(proxy_ips) == 0:
        proxy_ips = catchProxyList()
        print('reset proxies')
        
    ip = random.choice(proxy_ips)

    while not await checkProxyValid(ip, session):
        proxy_ips.remove(ip)
        print('remove ip: {}'.format(ip))
        if not proxy_ips:
            proxy_ips = catchProxyList()
            print('reset proxies')
            
        ip = random.choice(proxy_ips)
    
    print('find a valid IP: {}'.format(ip))
    try:
        resp = await session.post('https://brickseek.com/walmart-inventory-checker/', 
                         data = fromData,
                         headers = headers,
                         proxies={'http': 'http://jude:xxxxx@{}'.format(ip),
                                  'https': 'http://jude:xxxxx@{}'.format(ip)},
                         timeout= 5 
                            )
    except:
        print('fail to get price')
        return crawlPrice(upc)
    
    soup = BeautifulSoup(resp.text)
    return {upc: soup.find('strong', string = 'MSRP:').find_next_sibling().get_text()}
nest_asyncio.apply()

upcList = set(train['Upc_check_number']).union(set(test['Upc_check_number']))

async def main():
    async with aiohttp.ClientSession() as session:
        tasks = [loop.create_task(crawlPrice(upc, session)) for upc in upcList]
        finished, unfinished = await asyncio.wait(tasks)
        upc_result = [f.result() for f in finished]
            

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
loop.close()


但是在執行時,就落入用來判斷proxy是否可用的while迴圈中跑不出來了(如下)。但是我確定裡面是有可用proxy的(有單獨用程式測試過)。詳情問是哪裡出了問題呢?再麻煩解惑了,謝謝!


remove ip: 165.227.215.62:8080
remove ip: 158.69.183.122:8080
remove ip: 91.109.198.48:32062
remove ip: 96.9.80.62:45557
remove ip: 91.187.75.48:42296
remove ip: 134.209.29.65:8118
remove ip: 31.209.97.66:57482
remove ip: 217.69.3.4:8080
remove ip: 13.112.20.22:8080
remove ip: 201.64.22.51:80
remove ip: 170.247.152.132:43747
remove ip: 168.194.14.126:8080
remove ip: 207.154.231.213:8080
remove ip: 195.4.164.127:8080