3 # Script to generate distorted text images for a captcha system.
5 # Copyright (C) 2005 Neil Harris
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License along
18 # with this program; if not, write to the Free Software Foundation, Inc.,
19 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 # http://www.gnu.org/copyleft/gpl.html
22 # Further tweaks by Brion Vibber <brion@pobox.com>:
23 # 2006-01-26: Add command-line options for the various parameters
24 # 2007-02-19: Add --dirs param for hash subdirectory splits
25 # Tweaks by Greg Sabino Mullane <greg@turnstep.com>:
26 # 2008-01-06: Add regex check to skip words containing other than a-z
31 from optparse import OptionParser
35 import multiprocessing
40 from PIL import ImageFont
41 from PIL import ImageDraw
42 from PIL import ImageEnhance
43 from PIL import ImageOps
44 from PIL import ImageMath
46 sys.exit("This script requires the Python Imaging Library - http://www.pythonware.com/products/pil/")
# Pre-compiled pattern matching any character outside lowercase a-z; a word
# for which nonalpha.search() finds a match is rejected in try_pick_word.
48 nonalpha = re.compile('[^a-z]') # regex to test for suitability of words
50 # Does X-axis wobbly copy, sandwiched between two rotates
# Purpose (from the visible lines): rotate src by a randomised angle, paste it
# row by row into a fresh black image with a sine-shaped plus random
# horizontal offset per row, rotate the result back, and build a sharpness
# enhancer for it.
# NOTE(review): the embedded line numbers skip 52, 59-63, 66 and 73-75, so the
# statements that bind x, y, t and b, and the function's return, are not shown
# in this listing -- confirm against the complete file.
# Parameters as used below:
#   src   - source image; its .size and .rotate() are used
#   wob   - amplitude of the per-row horizontal offset
#   col   - not referenced by any visible line
#   scale - multiplier for the random frequency term f
#   ang   - base rotation angle, jittered by a uniform value between -10 and 10
51 def wobbly_copy(src, wob, col, scale, ang):
# f is the frequency term of the sine offset, p its random phase (0 .. 2*pi)
53 f = random.uniform(4*scale, 5*scale)
54 p = random.uniform(0, math.pi*2)
55 rr = ang+random.uniform(-10, 10) # vary, but not too much
56 int_d = Image.new('RGB', src.size, 0) # a black rectangle
57 rot = src.rotate(rr, Image.BILINEAR)
58 # Do a cheap bounding-box op here to try to limit work below
64 # and only do lines with content on
# presumably t and b are the top/bottom rows of rot's bounding box -- the
# lines that assign them are missing from this listing
65 for i in range(t, b+1):
# sine-shaped offset for row i, then up to +/- wob/2 of extra random jitter
67 xoff = int(math.sin(p+(i*f/y))*wob)
68 xoff += int(random.uniform(-wob*0.5, wob*0.5))
# copy the one-pixel-high strip for row i, shifted horizontally by xoff
69 int_d.paste(rot.crop((0, i, x, i+1)), (xoff, i))
70 # try to stop blurring from building up
# rotate back by the same angle that was applied before the row copy
71 int_d = int_d.rotate(-rr, Image.BILINEAR)
72 enh = ImageEnhance.Sharpness(int_d)
# Renders `text` with the TrueType font `fontname` at size `fontsize`,
# distorts it with repeated wobbly_copy passes, crops to the content, mixes in
# a blurred noise layer and inverts the result.
# NOTE(review): the embedded line numbers skip 79-80, 89, 92-94, 98, 102-103,
# 105, 108-110, 113, 121, 123, 126 and 128-129. The bindings of bgcolor,
# fgcolor, x, y, wob, rot, bbox, nblock and data, the header of the loop over
# i, and the statement that writes `file_name` are therefore not visible here
# -- confirm against the complete file.
76 def gen_captcha(text, fontname, fontsize, file_name):
77 """Generate a captcha image"""
78 # white text on a black background
81 # create a font object
82 font = ImageFont.truetype(fontname,fontsize)
83 # determine dimensions of the text
# NOTE(review): ImageFont.getsize() was removed in Pillow 10 -- confirm which
# Pillow version this script targets
84 dim = font.getsize(text)
85 # create a new image significantly larger that the text
# the canvas is square: longest text side plus twice the shortest side
86 edge = max(dim[0], dim[1]) + 2*min(dim[0], dim[1])
87 im = Image.new('RGB', (edge, edge), bgcolor)
88 d = ImageDraw.Draw(im)
90 # add the text to the image
# centre the text: half the canvas extent minus half the text extent
91 d.text((x/2-dim[0]/2, y/2-dim[1]/2), text, font=font, fill=fgcolor)
95 # Apply lots of small stirring operations, rather than a few large ones
96 # in order to get some uniformity of treatment, whilst
97 # maintaining randomness
# three passes per value of i, at base angles rot, rot+45 and rot+90, each
# with a different scale argument
99 im = wobbly_copy(im, wob, bgcolor, i*2+3, rot+0)
100 im = wobbly_copy(im, wob, bgcolor, i*2+1, rot+45)
101 im = wobbly_copy(im, wob, bgcolor, i*2+2, rot+90)
104 # now get the bounding box of the nonzero parts of the image
# NOTE(review): under Python 3 "/" is true division, so bord is a float and
# the crop box below receives float coordinates -- confirm this is intended
106 bord = min(dim[0], dim[1])/4 # a bit of a border
107 im = im.crop((bbox[0]-bord, bbox[1]-bord, bbox[2]+bord, bbox[3]+bord))
# NOTE(review): under Python 3 this yields a tuple of floats, which is then
# used as an image size and as range() arguments below; "//" looks intended
# -- confirm
111 nsize = (im.size[0] / nblock, im.size[1] / nblock)
112 noise = Image.new('L', nsize, bgcolor)
# each noise pixel = random value 0..65 plus a gradient that grows with x
# towards 70
114 for x in range(nsize[0]):
115 for y in range(nsize[1]):
116 r = random.randint(0, 65)
117 gradient = 70 * x / nsize[0]
118 data[x, y] = r + gradient
119 # Turn speckles into blobs
120 noise = noise.resize(im.size, Image.BILINEAR)
# a third of the greyscale text image plus the noise layer, converted to RGB
122 im = ImageMath.eval('convert(convert(a, "L") / 3 + b, "RGB")', a=im, b=noise)
124 # and turn into black on white
125 im = ImageOps.invert(im)
127 # save the image, in format determined from filename
# Builds a nested subdirectory path, one path component per loop iteration up
# to `levels`, and checks for each level whether basedir/<path so far> exists.
# The caller in run_in_thread joins the result with a file name and then with
# the output directory, so the return value is presumably the path relative
# to `basedir`.
# NOTE(review): the embedded line numbers skip 133-134, 136-137, 139-140 and
# 143-145, so the end of the docstring, the initial value of subdir, the
# binding of char, the directory-creation call and the return are not
# visible here -- confirm against the complete file.
# NOTE(review): an exists-check followed by a create can race when several
# worker processes (see multiprocessing.Pool in the main section) need the
# same directory at once -- confirm how the missing create call handles that.
130 def gen_subdir(basedir, md5hash, levels):
131 """Generate a subdirectory path out of the first _levels_
132 characters of _hash_, and ensure the directories exist
135 for i in range(0, levels):
138 subdir = os.path.join(subdir, char)
141 fulldir = os.path.join(basedir, subdir)
142 if not os.path.exists(fulldir):
# Makes one attempt at producing a candidate captcha word, either from
# `words` or, when `words` is None, from random lowercase letters, and then
# applies the length, alphabet and blacklist checks below.
# NOTE(review): the embedded line numbers skip 149, 151-154, 158-159, 161,
# 163, 165-166, 168, 170-171, 173, 175-176, 178-179 and 181-183. The loop that
# uses `nwords`, the else branch header, the `verbose` guards, the blacklist
# test itself and every return statement are therefore not visible -- confirm
# against the complete file. pick_word treats the result as "no word" in some
# cases, so the rejecting checks presumably return a falsy value.
146 def try_pick_word(words, blacklist, verbose, nwords, min_length, max_length):
147 if words is not None:
# uniformly random entry of the word list
148 word = words[random.randint(0,len(words)-1)]
# a further random entry, presumably appended to `word` in a missing line
150 word2 = words[random.randint(0,len(words)-1)]
# a non-positive max_length (the command-line default is -1) means "no
# limit"; here it is replaced by 10 as the upper bound for the random length
155 max_length = max_length if max_length > 0 else 10
156 for i in range(0, random.randint(min_length, max_length)):
# 97 is ord('a'), so this appends a random letter from 'a' to 'z'
157 word = word + chr(97 + random.randint(0,25))
160 print("word is %s" % word)
162 if len(word) < min_length:
164 print("skipping word pair '%s' because it has fewer than %d characters" % (word, min_length))
# the upper bound is only enforced when max_length is positive
167 if max_length > 0 and len(word) > max_length:
169 print("skipping word pair '%s' because it has more than %d characters" % (word, max_length))
# reject anything containing a character outside a-z
172 if nonalpha.search(word):
174 print("skipping word pair '%s' because it contains non-alphabetic characters" % word)
# per the message below, a word is rejected when it contains a blacklist entry
177 for naughty in blacklist:
180 print("skipping word pair '%s' because it contains blacklisted word '%s'" % (word, naughty))
# Calls try_pick_word up to 1000 times with the same arguments and terminates
# the process via sys.exit when no attempt succeeds.
# NOTE(review): the embedded line numbers skip 187-188, so the test that
# accepts a word and the return statement are not visible here -- presumably
# the first usable word is returned; confirm against the complete file.
184 def pick_word(words, blacklist, verbose, nwords, min_length, max_length):
185 for x in range(1000): # If we can't find a valid combination in 1000 tries, just give up
186 word = try_pick_word(words, blacklist, verbose, nwords, min_length, max_length)
# reached only when the loop above did not leave the function
189 sys.exit("Unable to find valid word combinations")
def read_wordlist(filename):
    """Read a word list file and return its entries as a list of strings.

    Every line is stripped of surrounding whitespace and lower-cased.
    Blank (or whitespace-only) lines are skipped: an empty entry is a
    substring of every word, so a stray blank line in a blacklist file
    would otherwise cause every candidate word to be rejected.

    Args:
        filename: Path of a text file holding one word per line.

    Returns:
        A list of the non-empty, stripped, lower-cased lines in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if reading fails part-way.
    with open(filename) as f:
        cleaned = (line.strip().lower() for line in f)
        return [entry for entry in cleaned if entry]
# Worker entry point passed to Pool.map in the main section. It receives one
# work item, a list built there as [chunks, words, blacklist, opts, font,
# fontsize], and generates `count` captcha images from it.
# NOTE(review): the parameter name shadows the builtin `object`.
# NOTE(review): the embedded line numbers skip 198-199, 201-204, 211 and
# 214-215, so the bindings of count, words, opts, font, fontsize, verbose,
# key, output and dirs, and any condition guarding the gen_subdir call, are
# not visible here -- confirm against the complete file.
197 def run_in_thread(object):
# index 2 of the work item is the blacklist
200 blacklist = object[2]
205 for i in range(count):
206 word = pick_word(words, blacklist, verbose, opts.number_words, opts.min_length, opts.max_length)
# random 32-bit salt rendered as 8 hex digits
207 salt = "%08x" % random.randrange(2**32)
208 # 64 bits of hash is plenty for this purpose
# keyed, salted MD5 of the answer, truncated to the first 16 hex digits
209 md5hash = hashlib.md5((key+salt+word+key+salt).encode('utf-8')).hexdigest()[:16]
# salt and hash both go into the file name
210 filename = "image_%s_%s.png" % (salt, md5hash)
# place the file inside the hash-derived subdirectory
212 subdir = gen_subdir(output, md5hash, dirs)
213 filename = os.path.join(subdir, filename)
216 gen_captcha(word, font, fontsize, os.path.join(output, filename))
# Script entry point: parses the command-line options, validates the required
# ones, loads the word list and blacklist, works out how many images to
# produce, and hands equal-sized work items to a multiprocessing pool.
# NOTE(review): the embedded line numbers skip many values in this section
# (for example 222, 225, 245-246, 248-250, 252-254, 256-258, 261-262, 264,
# 266-268, 272-273, 275-277, 282-283, 285-289, 291, 293 and 297). The end of
# the docstring, most of the validation conditions and the bindings of key,
# output, font, dirs, fill and data are therefore not visible here -- confirm
# against the complete file.
218 if __name__ == '__main__':
219 """This grabs random words from the dictionary 'words' (one
220 word per line) and generates a captcha image for each one,
221 with a keyed salted hash of the correct answer in the filename.
223 To check a reply, hash it in the same way with the same salt and
224 secret key, then compare with the hash value given.
# directory holding this script; used for the default blacklist location
226 script_dir = os.path.dirname(os.path.realpath(__file__))
227 parser = OptionParser()
228 parser.add_option("--wordlist", help="A list of words (required)", metavar="WORDS.txt")
# NOTE(review): "charcters" in the help text below is a typo for "characters";
# it is a runtime string and is left unchanged here
229 parser.add_option("--random", help="Use random charcters instead of a wordlist", action="store_true")
230 parser.add_option("--key", help="The passphrase set as $wgCaptchaSecret (required)", metavar="KEY")
231 parser.add_option("--output", help="The directory to put the images in - $wgCaptchaDirectory (required)", metavar="DIR")
232 parser.add_option("--font", help="The font to use (required)", metavar="FONT.ttf")
233 parser.add_option("--font-size", help="The font size (default 40)", metavar="N", type='int', default=40)
234 parser.add_option("--count", help="The maximum number of images to make (default 20)", metavar="N", type='int', default=20)
235 parser.add_option("--blacklist", help="A blacklist of words that should not be used", metavar="FILE", default=os.path.join(script_dir, "blacklist"))
236 parser.add_option("--fill", help="Fill the output directory to contain N files, overrides count, cannot be used with --dirs", metavar="N", type='int')
237 parser.add_option("--dirs", help="Put the images into subdirectories N levels deep - $wgCaptchaDirectoryLevels", metavar="N", type='int')
238 parser.add_option("--verbose", "-v", help="Show debugging information", action='store_true')
239 parser.add_option("--number-words", help="Number of words from the wordlist which make a captcha challenge (default 2)", type='int', default=2)
240 parser.add_option("--min-length", help="Minimum length for a captcha challenge", type='int', default=1)
# the default of -1 means "no maximum"; try_pick_word only enforces the limit
# when the value is positive
241 parser.add_option("--max-length", help="Maximum length for a captcha challenge", type='int', default=-1)
# NOTE(review): the work is run through multiprocessing.Pool below, so these
# are worker processes rather than threads
242 parser.add_option("--threads", help="Maximum number of threads to be used to generate captchas.", type='int', default=1)
244 opts, args = parser.parse_args()
247 wordlist = opts.wordlist
251 sys.exit("Need to specify a wordlist")
255 sys.exit("Need to specify a key")
259 sys.exit("Need to specify an output directory")
# the font option must be given and must name an existing path
260 if opts.font and os.path.exists(opts.font):
263 sys.exit("Need to specify the location of a font")
265 blacklist = read_wordlist(opts.blacklist)
269 verbose = opts.verbose
270 fontsize = opts.font_size
271 threads = opts.threads
# fill mode: only the shortfall is generated. os.listdir counts just the
# top-level entries of the output directory, which fits the --fill help text
# saying it cannot be combined with --dirs
274 count = max(0, fill - len(os.listdir(output)))
278 words = read_wordlist(wordlist)
# keep only words of 4 or 5 letters that do not start with "f" and whose
# first two and last two letters are not doubled
279 words = [x for x in words
280 if len(x) in (4,5) and x[0] != "f"
281 and x[0] != x[1] and x[-1] != x[-2]]
284 sys.exit("No need to generate CAPTCHA images.")
# NOTE(review): under Python 3 "/" yields a float, which run_in_thread later
# passes to range(); "//" looks intended -- confirm. With integer division
# any remainder (count not divisible by threads) would not be generated.
290 chunks = (count / threads)
292 p = multiprocessing.Pool(threads);
294 print("Generating %s CAPTCHA images separated in %s image(s) per chunk run by %s threads..." % (count, chunks, threads))
# one identical work item per worker
295 for i in range(0, threads):
296 data.append([chunks, words, blacklist, opts, font, fontsize])
# blocks until every work item has been processed
298 p.map(run_in_thread, data)