InterestRate/python/downloaderScript.py at master · evan01/InterestRate

History

198 lines (177 loc) · 6.74 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

'''
This is a script that will download images belonging to a specific keyword to a directory.

To use this module you MUST have python3 installed on your system, as the urllib calls used here do not work when
you run it with python2.

To use it, run this script (downloaderScript.py) from the command line; the args are...

-s SEARCH STRING

-f PATH/TO/SEARCH/TEXTFILE.txt (where each line contains a new search string)

When you run it, the downloads will go to the output_dir specified in the code below (or to the directory given
with -o).
'''

#!/usr/bin/env python3

import argparse

import atexit

import hashlib

import os

import pickle

import posixpath

import random

import re

import signal

import socket

import threading

import time

import urllib.parse

import urllib.request

# config
output_dir = './DOWNLOADED_IMAGES' # default output dir
adult_filter = True # Do not disable adult filter by default
# Acquired by every download() call before it touches the network, so at most
# 20 downloads run at once even though one thread is started per link.
pool_sema = threading.BoundedSemaphore(value=20) # max number of download threads
# Added to the `first=` offset after each search page is processed.
bingcount = 35 # default bing paging
# Process-wide default timeout (in seconds) for sockets created after this call.
socket.setdefaulttimeout(2)

# Shared state, mutated from the download threads and from the main section.
# File names (without directory) of downloads that have started but not completed.
in_progress = []
# SHA-224 digests of URLs that were downloaded successfully; persisted in the history pickle.
tried_urls = []
# SHA-224 digests of stripped keywords whose search completed; persisted in the history pickle.
finished_keywords = []
# (url, output_dir) tuples of first-attempt failures, retried by the main section.
failed_urls = []
# Maps the MD5 hex digest of image bytes to the file name it was saved under; persisted in the history pickle.
image_md5s = {}
# Header sent with every HTTP request (search pages and image downloads).
urlopenheader = {'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0'}

def download(url, output_dir, retry=False):
    """Fetch one image URL and store it in ``output_dir``.

    The function is meant to run in its own thread; concurrency is capped by
    the module-level ``pool_sema``. URLs whose SHA-224 digest is already in
    ``tried_urls`` are skipped, and images whose MD5 is already in
    ``image_md5s`` are reported as duplicates and NOT written to disk.

    Args:
        url: Absolute URL of the image.
        output_dir: Existing directory the image is written into.
        retry: True when this is a second attempt for a previously failed
            URL. It only changes the log wording and prevents the URL from
            being queued in ``failed_urls`` again.

    Returns:
        None. Results are reported through the shared module state
        (``tried_urls``, ``failed_urls``, ``image_md5s``, ``in_progress``)
        and through messages printed to stdout.
    """
    url_hash = hashlib.sha224(url.encode('utf-8')).digest()
    if url_hash in tried_urls:
        return

    # The semaphore is used as a context manager so the slot is released on
    # every exit path, including the early return for duplicates.
    with pool_sema:
        path = urllib.parse.urlsplit(url).path
        filename = posixpath.basename(path)
        if len(filename) > 40:
            # Keep the last four characters so a typical ".ext" suffix survives.
            filename = filename[:36] + filename[-4:]
        # Avoid clobbering an existing file by prefixing random numbers.
        while os.path.exists(output_dir + '/' + filename):
            filename = str(random.randint(0, 100)) + filename
        target = output_dir + '/' + filename

        in_progress.append(filename)
        started_writing = False
        try:
            request = urllib.request.Request(url, None, urlopenheader)
            with urllib.request.urlopen(request) as response:
                image = response.read()
            if len(image) == 0:
                # NOTE(review): an empty body is only reported; it is still
                # hashed and saved like any other payload, as before.
                print('no image')

            md5_key = hashlib.md5(image).hexdigest()
            if md5_key in image_md5s:
                print('FAIL Image is a duplicate of ' + image_md5s[md5_key] + ', not saving ' + filename)
                return

            started_writing = True
            with open(target, 'wb') as imagefile:
                imagefile.write(image)
            # Register the checksum only once the file is safely on disk, so
            # a failed write cannot make the retry look like a duplicate.
            image_md5s[md5_key] = filename

            if retry:
                print('Retry OK ' + filename)
            else:
                print("OK " + filename)
            tried_urls.append(url_hash)
        except Exception:
            # Thread boundary: any failure (timeout, HTTP error, disk error)
            # is reported and, on the first attempt, queued for a retry.
            if started_writing:
                # Drop the partial file now instead of leaving it for the
                # exit-time cleanup.
                try:
                    os.remove(target)
                except OSError:
                    pass
            if retry:
                print('Retry Fail ' + filename)
            else:
                print("FAIL " + filename)
                failed_urls.append((url, output_dir))
        finally:
            # Always forget the name: a stale entry would make the exit-time
            # cleanup delete a file that a later attempt saved successfully
            # under the same name.
            in_progress.remove(filename)

def removeNotFinished():
    """Delete the files of downloads that are still listed in ``in_progress``.

    Registered with ``atexit`` by the main section. Paths are built from the
    module-level ``output_dir`` as it is bound when this function runs; names
    whose file does not exist are skipped silently.
    """
    for pending_name in in_progress:
        leftover = '/'.join((output_dir, pending_name))
        try:
            os.remove(leftover)
        except FileNotFoundError:
            # Nothing was written under this name (or it is already gone).
            continue

def fetch_images_from_keyword(keyword, output_dir):
    """Page through Bing image results for ``keyword`` and start downloads.

    For every ``imgurl`` link found on a result page one thread running
    ``download(link, output_dir)`` is started. Paging stops when a page ends
    with the same link as the previous page, or once at least 2000 download
    threads have been started.

    The adult-filter value is read from the module-level name ``adlt``, which
    is assigned by the ``__main__`` section of this script.

    Args:
        keyword: Search string; it is URL-quoted before being sent.
        output_dir: Directory handed to every ``download`` call.

    Returns:
        False if a result page contained no image links (a "No search
        results" message is printed), True otherwise.

    Raises:
        urllib.error.URLError (or a socket timeout): if a search page cannot
        be fetched; these errors are not handled here.
    """
    current = 1
    last = ''
    index = 0
    while index < 2000:
        request_url = (
            'https://www.bing.com/images/async?q=' + urllib.parse.quote_plus(keyword)
            + '&async=content&first=' + str(current) + '&adlt=' + adlt
        )
        request = urllib.request.Request(request_url, None, headers=urlopenheader)
        # Close the HTTP response as soon as the page has been read.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf8')
        links = re.findall('imgurl:"(.*?)"', html)

        if not links:
            # NOTE(review): this also triggers when a later page comes back
            # empty after earlier pages had results; the original behaviour
            # (message + False) is kept for that case.
            print('No search results for "{0}"'.format(keyword))
            return False

        if links[-1] == last:
            # Same tail as the previous page: Bing has no further results.
            break
        last = links[-1]
        current += bingcount

        for link in links:
            t = threading.Thread(target=download, args=(link, output_dir))
            t.start()
            # NOTE(review): counted per started download (the 2000 cap is
            # only checked between pages) -- confirm against the original
            # indentation, which was lost in this copy of the file.
            index += 1

        # Small pause between result pages.
        time.sleep(0.1)
    return True

def backup_history(*args):
    """Pickle the download history to ``download_history.pickle``.

    Three objects are dumped back to back, in the order the main section
    loads them: ``tried_urls``, ``finished_keywords`` and ``image_md5s``.

    The file is written to the directory the history is loaded from at
    start-up. The main section stores that directory in ``output_dir_origin``
    before it rebinds ``output_dir`` to per-keyword sub-directories; when
    ``output_dir_origin`` is not defined, ``output_dir`` is used.

    Args:
        *args: Empty for a plain call. When invoked as a signal handler the
            interpreter passes ``(signum, frame)``; any argument makes the
            function terminate the process with status 0 after dumping.

    Raises:
        SystemExit: with code 0 when called with arguments.
        OSError: if the history file cannot be written.
    """
    history_dir = globals().get('output_dir_origin', output_dir)
    # `with` guarantees the file is closed even if pickling fails midway.
    with open(history_dir + '/download_history.pickle', 'wb') as download_history:
        pickle.dump(tried_urls, download_history)
        pickle.dump(finished_keywords, download_history)
        pickle.dump(image_md5s, download_history)
    print('history_dumped')
    if args:
        # Same effect as exit(0) without relying on the `site` helper that
        # is intended for interactive sessions.
        raise SystemExit(0)

def _wait_for_downloads():
    """Block until every thread other than the calling one has finished."""
    for worker in threading.enumerate():
        if worker is not threading.current_thread():
            worker.join()


if __name__ == "__main__":
    # Command-line entry point: parse arguments, restore the download
    # history, run the search(es) and persist the history again.
    atexit.register(removeNotFinished)

    parser = argparse.ArgumentParser(description='Bing image bulk downloader')
    parser.add_argument('-s', '--search-string', help='Keyword to search', required=False)
    parser.add_argument('-f', '--search-file', help='Path to a file containing search strings line by line',
                        required=False)
    parser.add_argument('-o', '--output', help='Output directory', required=False)
    parser.add_argument('--filter', help='Enable adult filter', action='store_true', required=False)
    parser.add_argument('--no-filter', help='Disable adult filter', action='store_true', required=False)
    args = parser.parse_args()

    if (not args.search_string) and (not args.search_file):
        parser.error('Provide Either search string or path to file containing search strings')

    if args.output:
        output_dir = args.output
    os.makedirs(output_dir, exist_ok=True)
    # Remember the top-level directory: `output_dir` is rebound per keyword
    # below, and the history file lives in the top-level directory.
    output_dir_origin = output_dir

    # Ctrl-C dumps the history before exiting.
    signal.signal(signal.SIGINT, backup_history)

    try:
        # SECURITY NOTE: unpickling executes arbitrary code if the file has
        # been tampered with; only point the script at output directories
        # you trust.
        with open(output_dir + '/download_history.pickle', 'rb') as download_history:
            tried_urls = pickle.load(download_history)
            finished_keywords = pickle.load(download_history)
            image_md5s = pickle.load(download_history)
    except (OSError, IOError):
        tried_urls = []
    except (EOFError, pickle.UnpicklingError):
        # Truncated or corrupt history (e.g. the process was killed while
        # dumping): keep whatever was loaded so far instead of crashing.
        print('download history is unreadable, continuing without the missing parts')

    # The configured default applies unless a command-line flag overrides
    # it; --no-filter wins when both flags are given.
    adlt = '' if adult_filter else 'off'
    if args.no_filter:
        adlt = 'off'
    elif args.filter:
        adlt = ''

    if args.search_string:
        fetch_images_from_keyword(args.search_string, output_dir)
        # Let the download threads finish so the saved history is complete.
        _wait_for_downloads()
        backup_history()
    elif args.search_file:
        try:
            inputFile = open(args.search_file)
        except (OSError, IOError):
            print("Couldn't open file {}".format(args.search_file))
            raise SystemExit(1)
        with inputFile:
            for line in inputFile:
                # Strip the trailing newline before hashing AND before
                # searching; blank lines carry no keyword.
                keyword = line.strip()
                if not keyword:
                    continue
                keyword_hash = hashlib.sha224(keyword.encode('utf-8')).digest()
                if keyword_hash in finished_keywords:
                    print('"{0}" Already downloaded'.format(keyword))
                    continue

                # Rebinding the module-level name is deliberate:
                # removeNotFinished() reads it to locate unfinished files.
                output_dir = output_dir_origin + '/' + keyword.replace(' ', '_')
                os.makedirs(output_dir, exist_ok=True)

                completed = fetch_images_from_keyword(keyword, output_dir)
                # Wait for the first-pass downloads so that `failed_urls`
                # and `tried_urls` are complete before they are used.
                _wait_for_downloads()
                if completed:
                    finished_keywords.append(keyword_hash)
                    for failed_url in failed_urls:
                        t = threading.Thread(target=download, args=(failed_url[0], failed_url[1], True))
                        t.start()
                    failed_urls = []
                    _wait_for_downloads()
                backup_history()

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

downloaderScript.py

Latest commit

History

downloaderScript.py

File metadata and controls