#!/usr/bin/env python3
# This is a script that an anon from 4chan wrote and that I modified. The
# script downloads all Microsoft patents. There are somewhere around 45,500
# URLs / PDFs, so I set the range from 0 to 100,000; the script simply keeps
# going, and when you notice it hasn't created new files in a while, you know
# it is done fetching all the patents. If a URL can't be fetched, the script
# retries 10 times, with a 10-second interval. After that it gives up on that
# particular URL and moves on to the next.

import requests
from requests import get
import bs4
import re
import time
import os

def download(url, file_name, max_retries=10, retry_delay=10, timeout=60):
    """Fetch *url* and save the response body to *file_name*.

    If *file_name* already exists the download is skipped, so the script can
    be re-run and will only fetch what is missing.

    Args:
        url: Address of the file to fetch.
        file_name: Path the response body is written to.
        max_retries: How many times a failed attempt is retried before
            giving up (so at most ``max_retries + 1`` attempts are made).
        retry_delay: Seconds to sleep between attempts.
        timeout: Seconds to wait for the server before an attempt is
            considered failed; without it a stalled connection blocks forever.

    Returns:
        True if the file already existed or was downloaded, False if every
        attempt failed.
    """
    if os.path.isfile(file_name):
        print('skipping ' + file_name + '...')
        return True

    # Write to a side file first and rename it only once it is complete.
    # Otherwise an interrupted write would leave a truncated file behind that
    # the isfile() check above treats as "already downloaded" on the next run.
    # A stale side file from a failed attempt is simply overwritten.
    partial_name = file_name + '.part'

    for attempt in range(1, max_retries + 2):
        try:
            response = requests.get(url, timeout=timeout)
            # Treat HTTP error statuses as failures instead of saving the
            # server's error page under a .pdf name.
            response.raise_for_status()
            with open(partial_name, 'wb') as file:
                file.write(response.content)
            os.replace(partial_name, file_name)
        except (requests.RequestException, OSError):
            # Only network and file-system errors are retried; anything else
            # (including Ctrl-C) propagates to the caller.
            if attempt > max_retries:
                return False
            print('retrying ' + str(attempt) + ' ' + url + '...')
            time.sleep(retry_delay)
        else:
            # Report the file that was written rather than reading a counter
            # from module scope, so the function works from any caller.
            print('downloaded ' + file_name + '\r\n')
            return True
    return False

# The search-result URL for one record is PREFIX + record index + SUFFIX.
# NOTE(review): the host "patft.usb.com" looks like a typo for the patent
# office's full-text search host - confirm before relying on this script.
SEARCH_URL_PREFIX = "http://patft.usb.com/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r="
SEARCH_URL_SUFFIX = "&f=G&l=50&d=PTXT&p=1&S1=microsoft.ASNM.&OS=AN/microsoft&RS=AN/microsoft"

# Sent with the search-page request only, exactly as before; built once
# instead of on every loop iteration.
headers = {'User-Agent':'HTC Mozilla/5.0 (Linux; Android 7.0; HTC 10 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.83 Mobile Safari/537.36.'}

MAX_RETRIES = 10      # retries per record after the first failed attempt
RETRY_DELAY = 10      # seconds to sleep between attempts
REQUEST_TIMEOUT = 60  # seconds before a stalled request counts as failed


def _fetch_patent(start_url):
    """Resolve one search-result page to its PDF and pass it to download().

    Returns:
        True if a PDF location was found and handed to download(), False if
        the result page contains no 'View+first+page' link (nothing to fetch).

    Raises:
        requests.RequestException: a request failed or returned an HTTP
            error status.
        LookupError: the image page has no <embed> carrying both a 'src'
            and a 'name' attribute.
    """
    content = requests.get(start_url, headers=headers, timeout=REQUEST_TIMEOUT)
    content.raise_for_status()
    soup = bs4.BeautifulSoup(content.text, features='lxml')

    links = soup.find_all('a', href=lambda x: x and 'View+first+page' in x)
    if not links:
        # Report "nothing here" explicitly. Previously the link variable was
        # left undefined, or still held the previous record's link, so the
        # previous patent was silently fetched again.
        return False

    # When several links match, the last one wins (as it always has).
    step2 = links[-1]['href'] + '&pagenum=0'
    sec_content = requests.get(step2, timeout=REQUEST_TIMEOUT)
    sec_content.raise_for_status()
    soup2 = bs4.BeautifulSoup(sec_content.text, features='lxml')

    embed = soup2.find('embed')
    if embed is None or not embed.get('src') or not embed.get('name'):
        raise LookupError('no PDF <embed> found on ' + step2)

    # The embed's src has no scheme, so 'http:' is prepended.
    download('http:' + embed.get('src'), embed.get('name') + '.pdf')
    return True


for num in range(0, 100000):
    start_url = SEARCH_URL_PREFIX + str(num) + SEARCH_URL_SUFFIX

    for tries in range(1, MAX_RETRIES + 2):
        try:
            found = _fetch_patent(start_url)
        except (requests.RequestException, LookupError):
            # Only network and page-layout failures are retried; anything
            # else (including Ctrl-C) stops the script instead of being
            # swallowed.
            if tries > MAX_RETRIES:
                break  # give up on this record and move on to the next
            print('retrying ' + str(tries) + ' ' + start_url + '...')
            time.sleep(RETRY_DELAY)
        else:
            if not found:
                print('no patent link for ' + str(num) + ', skipping...')
            break
  • 네이버 블러그 공유하기
  • 네이버 밴드에 공유하기
  • 페이스북 공유하기
  • 카카오스토리 공유하기

댓글을 달아 주세요

TistoryWhaleSkin3.4">