#!/usr/bin/env python3
# This is a script an anon from 4chan wrote, and which I modified. The script
# will download all Microsoft patents from the USPTO full-text search. There
# are somewhere around 45,500 URLs / PDFs, so I set the range from 0 to
# 100,000; the script will simply run past the end, and when you notice it
# hasn't created new files in a while, you know it is done fetching all the
# patents. If a URL can't be fetched, the script retries the connection up to
# 10 times at 10-second intervals. After that it gives up on that particular
# URL and moves on to the next.
import os
import time

import bs4
import requests
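# Fetch one PDF to disk, skipping files that already exist and retrying
# failed connections as described in the header comment.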
def download(url, file_name):
    # Skip PDFs that were already fetched on a previous run.
    if os.path.isfile(file_name):
        print('skipping ' + file_name + '...')
        return True
    connected = False
    tries_2 = 0
    while not connected:
        try:
            response = requests.get(url, timeout=30)
            with open(file_name, 'wb') as file:
                file.write(response.content)
            print('downloaded ' + file_name + '\r\n')
            connected = True
        except requests.RequestException:
            tries_2 = tries_2 + 1
            if tries_2 == 11:
                # 10 retries exhausted; give up on this URL.
                connected = True
            else:
                print('retrying ' + str(tries_2) + ' ' + url + '...')
                time.sleep(10)
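# Main loop: step through the PatFT advanced-search results one hit at a
# time (the r= parameter selects the num-th result for assignee "microsoft").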
for num in range(0, 100000):
    start_url = "http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=" + str(num) + "&f=G&l=50&d=PTXT&p=1&S1=microsoft.ASNM.&OS=AN/microsoft&RS=AN/microsoft"
    headers = {'User-Agent': 'HTC Mozilla/5.0 (Linux; Android 7.0; HTC 10 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.83 Mobile Safari/537.36'}
    connected = False
    tries = 0
    while not connected:
        try:
            content = requests.get(start_url, headers=headers, timeout=30)
            soup = bs4.BeautifulSoup(content.text, features='lxml')
            # Each hit page has a "View first page" link to an image viewer;
            # the viewer embeds the PDF we actually want.
            for t in soup.find_all('a', href=lambda x: x and 'View+first+page' in x):
                step2 = t['href'] + '&pagenum=0'
                sec_content = requests.get(step2, timeout=30)
                soup2 = bs4.BeautifulSoup(sec_content.text, features='lxml')
                embed = soup2.find('embed')
                if embed is not None:
                    download('http:' + embed.get('src'), embed.get('name') + '.pdf')
            connected = True
        except requests.RequestException:
            tries = tries + 1
            if tries == 11:
                connected = True
            else:
                print('retrying ' + str(tries) + ' ' + start_url + '...')
                time.sleep(10)
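
# A minimal usage sketch, assuming the script is saved as (for example)
# ms_patents.py and Python 3 is available:
#
#   pip install requests beautifulsoup4 lxml
#   python3 ms_patents.py
#
# PDFs are written to the current working directory, named after the embed
# element's name attribute on the viewer page.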