extensions/ConfirmEdit/captcha.py

   1 #!/usr/bin/python
   2 #
   3 # Script to generate distorted text images for a captcha system.
   4 #
   5 # Copyright (C) 2005 Neil Harris
   6 #
   7 # This program is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License along
  18 # with this program; if not, write to the Free Software Foundation, Inc.,
  19 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  20 # http://www.gnu.org/copyleft/gpl.html
  21 #
  22 # Further tweaks by Brion Vibber <brion@pobox.com>:
  23 # 2006-01-26: Add command-line options for the various parameters
  24 # 2007-02-19: Add --dirs param for hash subdirectory splits
  25 # Tweaks by Greg Sabino Mullane <greg@turnstep.com>:
  26 # 2008-01-06: Add regex check to skip words containing other than a-z
  27
  28 import random
  29 import math
  30 import hashlib
  31 from optparse import OptionParser
  32 import os
  33 import sys
  34 import re
  35 import multiprocessing
  36 import time
  37
  38 try:
  39         from PIL import Image
  40         from PIL import ImageFont
  41         from PIL import ImageDraw
  42         from PIL import ImageEnhance
  43         from PIL import ImageOps
  44         from PIL import ImageMath
  45 except:
  46         sys.exit("This script requires the Python Imaging Library - http://www.pythonware.com/products/pil/")
  47
  48 nonalpha = re.compile('[^a-z]') # regex to test for suitability of words
  49
  50 # Does X-axis wobbly copy, sandwiched between two rotates
  51 def wobbly_copy(src, wob, col, scale, ang):
  52         x, y = src.size
  53         f = random.uniform(4*scale, 5*scale)
  54         p = random.uniform(0, math.pi*2)
  55         rr = ang+random.uniform(-10, 10) # vary, but not too much
  56         int_d = Image.new('RGB', src.size, 0) # a black rectangle
  57         rot = src.rotate(rr, Image.BILINEAR)
  58         # Do a cheap bounding-box op here to try to limit work below
  59         bbx = rot.getbbox()
  60         if bbx == None:
  61                 return src
  62         else:
  63                 l, t, r, b= bbx
  64         # and only do lines with content on
  65         for i in range(t, b+1):
  66                 # Drop a scan line in
  67                 xoff = int(math.sin(p+(i*f/y))*wob)
  68                 xoff += int(random.uniform(-wob*0.5, wob*0.5))
  69                 int_d.paste(rot.crop((0, i, x, i+1)), (xoff, i))
  70         # try to stop blurring from building up
  71         int_d = int_d.rotate(-rr, Image.BILINEAR)
  72         enh = ImageEnhance.Sharpness(int_d)
  73         return enh.enhance(2)
  74
  75
  76 def gen_captcha(text, fontname, fontsize, file_name):
  77         """Generate a captcha image"""
  78         # white text on a black background
  79         bgcolor = 0x0
  80         fgcolor = 0xffffff
  81         # create a font object
  82         font = ImageFont.truetype(fontname,fontsize)
  83         # determine dimensions of the text
  84         dim = font.getsize(text)
  85         # create a new image significantly larger that the text
  86         edge = max(dim[0], dim[1]) + 2*min(dim[0], dim[1])
  87         im = Image.new('RGB', (edge, edge), bgcolor)
  88         d = ImageDraw.Draw(im)
  89         x, y = im.size
  90         # add the text to the image
  91         d.text((x/2-dim[0]/2, y/2-dim[1]/2), text, font=font, fill=fgcolor)
  92         k = 2
  93         wob = 0.09*dim[1]
  94         rot = 45
  95         # Apply lots of small stirring operations, rather than a few large ones
  96         # in order to get some uniformity of treatment, whilst
  97         # maintaining randomness
  98         for i in range(k):
  99                 im = wobbly_copy(im, wob, bgcolor, i*2+3, rot+0)
 100                 im = wobbly_copy(im, wob, bgcolor, i*2+1, rot+45)
 101                 im = wobbly_copy(im, wob, bgcolor, i*2+2, rot+90)
 102                 rot += 30
 103
 104         # now get the bounding box of the nonzero parts of the image
 105         bbox = im.getbbox()
 106         bord = min(dim[0], dim[1])/4 # a bit of a border
 107         im = im.crop((bbox[0]-bord, bbox[1]-bord, bbox[2]+bord, bbox[3]+bord))
 108
 109         # Create noise
 110         nblock = 4
 111         nsize = (im.size[0] / nblock, im.size[1] / nblock)
 112         noise = Image.new('L', nsize, bgcolor)
 113         data = noise.load()
 114         for x in range(nsize[0]):
 115                 for y in range(nsize[1]):
 116                         r = random.randint(0, 65)
 117                         gradient = 70 * x / nsize[0]
 118                         data[x, y] = r + gradient
 119         # Turn speckles into blobs
 120         noise = noise.resize(im.size, Image.BILINEAR)
 121         # Add to the image
 122         im = ImageMath.eval('convert(convert(a, "L") / 3 + b, "RGB")', a=im, b=noise)
 123
 124         # and turn into black on white
 125         im = ImageOps.invert(im)
 126
 127         # save the image, in format determined from filename
 128         im.save(file_name)
 129
 130 def gen_subdir(basedir, md5hash, levels):
 131         """Generate a subdirectory path out of the first _levels_
 132         characters of _hash_, and ensure the directories exist
 133         under _basedir_."""
 134         subdir = None
 135         for i in range(0, levels):
 136                 char = md5hash[i]
 137                 if subdir:
 138                         subdir = os.path.join(subdir, char)
 139                 else:
 140                         subdir = char
 141                 fulldir = os.path.join(basedir, subdir)
 142                 if not os.path.exists(fulldir):
 143                         os.mkdir(fulldir)
 144         return subdir
 145
 146 def try_pick_word(words, blacklist, verbose, nwords, min_length, max_length):
 147         if words is not None:
 148                 word = words[random.randint(0,len(words)-1)]
 149                 while nwords > 1:
 150                         word2 = words[random.randint(0,len(words)-1)]
 151                         word = word + word2
 152                         nwords = nwords - 1
 153         else:
 154                 word = ''
 155                 max_length = max_length if max_length > 0 else 10
 156                 for i in range(0, random.randint(min_length, max_length)):
 157                         word = word + chr(97 + random.randint(0,25))
 158
 159         if verbose:
 160                 print("word is %s" % word)
 161
 162         if len(word) < min_length:
 163                 if verbose:
 164                         print("skipping word pair '%s' because it has fewer than %d characters" % (word, min_length))
 165                 return None
 166
 167         if max_length > 0 and len(word) > max_length:
 168                 if verbose:
 169                         print("skipping word pair '%s' because it has more than %d characters" % (word, max_length))
 170                 return None
 171
 172         if nonalpha.search(word):
 173                 if verbose:
 174                         print("skipping word pair '%s' because it contains non-alphabetic characters" % word)
 175                 return None
 176
 177         for naughty in blacklist:
 178                 if naughty in word:
 179                         if verbose:
 180                                 print("skipping word pair '%s' because it contains blacklisted word '%s'" % (word, naughty))
 181                         return None
 182         return word
 183
 184 def pick_word(words, blacklist, verbose, nwords, min_length, max_length):
 185         for x in range(1000): # If we can't find a valid combination in 1000 tries, just give up
 186                 word = try_pick_word(words, blacklist, verbose, nwords, min_length, max_length)
 187                 if word:
 188                         return word
 189         sys.exit("Unable to find valid word combinations")
 190
 191 def read_wordlist(filename):
 192         f = open(filename)
 193         words = [x.strip().lower() for x in f.readlines()]
 194         f.close()
 195         return words
 196
 197 def run_in_thread(object):
 198         count = object[0];
 199         words = object[1]
 200         blacklist = object[2]
 201         opts = object[3]
 202         font = object[4]
 203         fontsize = object[5]
 204
 205         for i in range(count):
 206                 word = pick_word(words, blacklist, verbose, opts.number_words, opts.min_length, opts.max_length)
 207                 salt = "%08x" % random.randrange(2**32)
 208                 # 64 bits of hash is plenty for this purpose
 209                 md5hash = hashlib.md5((key+salt+word+key+salt).encode('utf-8')).hexdigest()[:16]
 210                 filename = "image_%s_%s.png" % (salt, md5hash)
 211                 if dirs:
 212                         subdir = gen_subdir(output, md5hash, dirs)
 213                         filename = os.path.join(subdir, filename)
 214                 if verbose:
 215                         print(filename)
 216                 gen_captcha(word, font, fontsize, os.path.join(output, filename))
 217
 218 if __name__ == '__main__':
 219         """This grabs random words from the dictionary 'words' (one
 220         word per line) and generates a captcha image for each one,
 221         with a keyed salted hash of the correct answer in the filename.
 222
 223         To check a reply, hash it in the same way with the same salt and
 224         secret key, then compare with the hash value given.
 225         """
 226         script_dir = os.path.dirname(os.path.realpath(__file__))
 227         parser = OptionParser()
 228         parser.add_option("--wordlist", help="A list of words (required)", metavar="WORDS.txt")
 229         parser.add_option("--random", help="Use random charcters instead of a wordlist", action="store_true")
 230         parser.add_option("--key", help="The passphrase set as $wgCaptchaSecret (required)", metavar="KEY")
 231         parser.add_option("--output", help="The directory to put the images in - $wgCaptchaDirectory (required)", metavar="DIR")
 232         parser.add_option("--font", help="The font to use (required)", metavar="FONT.ttf")
 233         parser.add_option("--font-size", help="The font size (default 40)", metavar="N", type='int', default=40)
 234         parser.add_option("--count", help="The maximum number of images to make (default 20)", metavar="N", type='int', default=20)
 235         parser.add_option("--blacklist", help="A blacklist of words that should not be used", metavar="FILE", default=os.path.join(script_dir, "blacklist"))
 236         parser.add_option("--fill", help="Fill the output directory to contain N files, overrides count, cannot be used with --dirs", metavar="N", type='int')
 237         parser.add_option("--dirs", help="Put the images into subdirectories N levels deep - $wgCaptchaDirectoryLevels", metavar="N", type='int')
 238         parser.add_option("--verbose", "-v", help="Show debugging information", action='store_true')
 239         parser.add_option("--number-words", help="Number of words from the wordlist which make a captcha challenge (default 2)", type='int', default=2)
 240         parser.add_option("--min-length", help="Minimum length for a captcha challenge", type='int', default=1)
 241         parser.add_option("--max-length", help="Maximum length for a captcha challenge", type='int', default=-1)
 242         parser.add_option("--threads", help="Maximum number of threads to be used to generate captchas.", type='int', default=1)
 243
 244         opts, args = parser.parse_args()
 245
 246         if opts.wordlist:
 247                 wordlist = opts.wordlist
 248         elif opts.random:
 249                 wordlist = None
 250         else:
 251                 sys.exit("Need to specify a wordlist")
 252         if opts.key:
 253                 key = opts.key
 254         else:
 255                 sys.exit("Need to specify a key")
 256         if opts.output:
 257                 output = opts.output
 258         else:
 259                 sys.exit("Need to specify an output directory")
 260         if opts.font and os.path.exists(opts.font):
 261                 font = opts.font
 262         else:
 263                 sys.exit("Need to specify the location of a font")
 264
 265         blacklist = read_wordlist(opts.blacklist)
 266         count = opts.count
 267         fill = opts.fill
 268         dirs = opts.dirs
 269         verbose = opts.verbose
 270         fontsize = opts.font_size
 271         threads = opts.threads
 272
 273         if fill:
 274                 count = max(0, fill - len(os.listdir(output)))
 275
 276         words = None
 277         if wordlist:
 278                 words = read_wordlist(wordlist)
 279                 words = [x for x in words
 280                         if len(x) in (4,5) and x[0] != "f"
 281                         and x[0] != x[1] and x[-1] != x[-2]]
 282
 283         if count == 0:
 284                 sys.exit("No need to generate CAPTCHA images.")
 285
 286         if count < threads:
 287                 chunks = 1
 288                 threads = 1
 289         else:
 290                 chunks = (count / threads)
 291
 292         p = multiprocessing.Pool(threads);
 293         data = []
 294         print("Generating %s CAPTCHA images separated in %s image(s) per chunk run by %s threads..." % (count, chunks, threads))
 295         for i in range(0, threads):
 296                 data.append([chunks, words, blacklist, opts, font, fontsize])
 297
 298         p.map(run_in_thread, data)