Batch-Downloading from Wikimedia servers (2)
Submitted by Christoph on 17 December, 2008 - 08:15
Some time ago I wrote how to download a category of files from Wikipedia. As the API got updates and my program now can download more than the maximum page size of 500 entries, I'll repost the script:
#!/usr/bin/python
# -*- coding: utf8 -*-
#
# Christoph Burgmer, 2008
# Released unter the MIT License.
#
import urllib
import sys
import re
import os
prependURL = "http://commons.wikimedia.org/w/api.php" \
+ "?action=query&prop=imageinfo&iiprop=url&format=xml&titles="
maxFiles = 500
class AppURLopener(urllib.FancyURLopener):
version="Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"
urllib._urlopener = AppURLopener()
cat = urllib.quote(sys.argv[1].replace('Category:', ''))
baseUrl = "http://commons.wikimedia.org/w/api.php" \
+ "?action=query&list=categorymembers&cmtitle=Category:" \
+ cat + "&cmnamespace=6&format=xml&cmlimit=" + str(maxFiles)
print "getting cat", cat, "(maximum "+ str(maxFiles) + ")"
continueRegex = re.compile('<query-continue>' \
+ '<categorymembers cmcontinue="([^\>"]+)" />' + '</query-continue>')
continueParam = None
while True:
if continueParam:
url = baseUrl + '&cmcontinue=' + urllib.quote(continueParam)
else:
url = baseUrl
print "retrieving category page url", url
f = urllib.urlopen(url)
content = f.read()
for imageName in re.findall(r'<cm[^>]+title="([^\>"]+)" />', content):
imageDescriptionUrl = prependURL + imageName
matchObj = re.search("File:([^/]+)$", imageName)
if matchObj:
fileName = matchObj.group(1).strip("\n")
if os.path.exists(fileName):
print "skipping", fileName
else:
print "getting file description page", imageName
d = urllib.urlopen(imageDescriptionUrl)
matchObj = re.search('<ii[^>]*?url="([^\>"]+)[^>]*>', d.read())
if matchObj:
fileUrl = matchObj.group(1)
print "getting", fileName, fileUrl
urllib.urlretrieve(fileUrl, fileName)
matchObj = continueRegex.search(content)
if matchObj:
continueParam = matchObj.group(1)
else:
break

http://commons.wikimedia.org/
http://commons.wikimedia.org/wiki/Category:Order.gif_stroke_order_images 362 files :)
http://commons.wikimedia.org/w/index.php?title=Commons:Stroke_Order_Project#Finding%20characters