Python – Recursive Glob & Line Counter

The other day I needed a recursive glob to find all the *.py files in my home directory. Much to my amazement, Python doesn’t have one built into the glob module, so I built my own and decided to share it for anyone else who might need it. Fork it here, or install it from PyPI with easy_install.

Recursive Glob Module
	rglob(base, pattern)
	lcount(base, pattern, func=lambda x : True)
import glob
import os

def _getDirs(base):
	""" Lists the immediate subdirectories of base.
			base - directory whose entries are inspected
		Entries are found with a '*' glob, so every returned path is already
		prefixed with base and names starting with a dot are not matched.
	"""
	subdirs = []
	for entry in glob.iglob(os.path.join(base, '*')):
		# keep directories only; plain files are dropped
		if os.path.isdir(entry):
			subdirs.append(entry)
	return subdirs

def _count(files, func):
	""" Totals the lines, across all files, that pass the filter.
			files - iterable of file paths to read
			func - boolean filter function called with each line (including its
				trailing newline); a line is counted when it returns a true value
		Returns the total as an int (0 when files is empty).
	"""
	lines = 0
	for f in files:
		# 'with' closes each file as soon as it has been counted instead of
		# leaving the handle open until the garbage collector gets to it
		with open(f) as fh:
			lines += sum(1 for l in fh if func(l))
	return lines

def rglob(base, pattern):
	""" Recursive glob starting in specified directory.
			base - root directory to start the search
			pattern - pattern for glob to match in every directory (i.e '*.py')
		Returns a list of matching paths: the matches found directly in base
		first, followed by the matches found under each subdirectory in turn.
	"""
	# Start with the matches that live directly in this directory.
	flist = glob.glob(os.path.join(base, pattern))
	for d in _getDirs(base):
		# _getDirs returns paths that already include base, so recurse on d
		# as-is; joining it with base again would double the prefix whenever
		# base is a relative path.
		flist.extend(rglob(d, pattern))
	return flist

def rglob_(pattern):
	""" Recursive glob rooted at the current working directory.
			pattern - pattern for glob to match (i.e '*.py')
	"""
	cwd = os.getcwd()
	return rglob(cwd, pattern)

def lcount(base, pattern, func = lambda x : True):
	""" Counts the total number of lines across all files found matching pattern.
			base - root directory to start the search
			pattern - pattern for glob to match (i.e '*.py')
			func - boolean filter function; only lines for which it returns a
				true value are counted
				example: lambda x : True if len(x.strip()) else False #filter empty lines
				default: lambda x : True
		Returns the total as a single number.
	"""
	allFiles = rglob(base, pattern)
	return _count(allFiles, func)

if __name__ == "__main__":
	# Demo: count the lines of code in *.py files, ignoring blank lines and
	# lines whose first non-blank character is '#'.
	def filterFunc(x):
		stripped = x.strip()
		return bool(stripped) and not stripped.startswith('#')

	# print(...) with a single argument runs on both Python 2.7 and Python 3.

	# 1) files under the directory containing this script
	pyFiles = rglob(os.path.dirname(__file__), "*.py")
	print(" {} total lines".format(_count(pyFiles, filterFunc)))

	# 2) files under the current working directory
	pyFiles_ = rglob_("*.py")
	print(" {} total lines".format(_count(pyFiles_, filterFunc)))

	# 3) same search as (1), done in one call through lcount
	print(" {} total lines".format(lcount(os.path.dirname(__file__), "*.py", filterFunc)))

It works by starting in a base directory, where it iterates through a glob iterator and keeps every entry that is a subdirectory. It then goes through these subdirectories one at a time and performs a glob pattern match on the contents of each one. This is repeated recursively until every subdirectory under the base has been inspected.