You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

844 lines
26 KiB
Python

#!/usr/bin/env python
# -*- coding: utf8 -*-
# ____ ____ __ ____ __ _ ____ __ _ _ ____ __ __ _ ____
# ( __)( _ \( )( __)( ( \( \( ) ( \/ )___( __)( )( ( \( \
# ) _) ) / )( ) _) / / ) D (/ (_/\ ) /(___)) _) )( / / ) D (
# (__) (__\_)(__)(____)\_)__)(____/\____/(__/ (__) (__)\_)__)(____/
#
# The friendlier file finder.
import time
import os
import optparse
import string
import sys
import re
from optparse import OptionParser, OptionGroup
# Constants -------------------------------------------------------------------
# How the search pattern treats letter case.
CASE_SENSITIVE, CASE_INSENSITIVE, CASE_SMART = 1, 2, 3

# Size units, expressed in bytes (each step is a factor of 1024).
BYTE = 1
KILOBYTE = BYTE * 1024
MEGABYTE = BYTE * 1024 ** 2
GIGABYTE = BYTE * 1024 ** 3
TERABYTE = BYTE * 1024 ** 4
PETABYTE = BYTE * 1024 ** 5

# Directory names that belong to version control systems.
VCS_DIRS = ['.hg', '.git', '.svn']

# The four kinds of directory entry, followed by the named sets of kinds
# that the --type filter is assembled from.
TYPE_FILE_REAL, TYPE_FILE_SYMLINK, TYPE_DIR_REAL, TYPE_DIR_SYMLINK = 1, 2, 3, 4

TYPES_FILE_REAL = set((TYPE_FILE_REAL,))
TYPES_FILE_SYMLINK = set((TYPE_FILE_SYMLINK,))
TYPES_DIR_REAL = set((TYPE_DIR_REAL,))
TYPES_DIR_SYMLINK = set((TYPE_DIR_SYMLINK,))
TYPES_FILE = set((TYPE_FILE_REAL, TYPE_FILE_SYMLINK))
TYPES_DIR = set((TYPE_DIR_REAL, TYPE_DIR_SYMLINK))
TYPES_REAL = set((TYPE_FILE_REAL, TYPE_DIR_REAL))
TYPES_SYMLINK = set((TYPE_FILE_SYMLINK, TYPE_DIR_SYMLINK))
TYPES_ALL = TYPES_FILE.union(TYPES_DIR)

# Durations, expressed in seconds.  A month is a flat 30 days and a year is
# 365.2425 days, truncated to a whole number of seconds.
SECOND = 1
MINUTE = SECOND * 60
HOUR = MINUTE * 60
DAY = HOUR * 24
WEEK = DAY * 7
MONTH = DAY * 30
YEAR = int(365.2425 * DAY)

# Pattern syntaxes an ignore file can switch between.
IGNORE_SYNTAX_REGEX, IGNORE_SYNTAX_GLOB, IGNORE_SYNTAX_LITERAL = 1, 2, 3

# How aggressively the search skips things (see the -r/-q/-u/-a options).
(IGNORE_MODE_RESTRICTED,
 IGNORE_MODE_SEMI,
 IGNORE_MODE_UNRESTRICTED,
 IGNORE_MODE_ALL) = 1, 2, 3, 4
# Regexes ---------------------------------------------------------------------
# A size: a number (optionally fractional), an optional unit letter, then
# any further letters (so "kb" or "gigs" are accepted after the unit).
SIZE_RE = re.compile(
    r'^(\d+(?:\.\d+)?)([bkmgtp])?[a-z]*$',
    re.I)

# One "<number><unit>" piece of a relative time specification.
AGO_RE = re.compile(r'''
(\d+(?:\.\d+)?) # The number (float/int)
\s* # Optional whitespace
( # Units
y(?:ears?)? # y/year/years
| mos?(?:nths?)? # mo/mos/month/months
| w(?:eeks?)? # w/week/weeks
| d(?:ays?)? # d/day/days
| h(?:ours?)? # h/hour/hours
| m(?:ins?(?:utes?)?)? # m/min/mins/minute/minutes
| s(?:ecs?(?:onds?)?)? # s/sec/secs/second/seconds
)
''', re.X | re.I)

# .ffignore lines: "syntax: <name>" switches, comments and blank lines.
IGNORE_SYNTAX_RE = re.compile(
    r'^\s*syntax:\s*(glob|regexp|regex|re|literal)\s*$',
    re.I)
IGNORE_COMMENT_RE = re.compile(r'^\s*#')
IGNORE_BLANK_RE = re.compile(r'^\s*$')

# .gitignore lines: comments, blank lines and negated ("!") patterns.
GITIGNORE_COMMENT_RE = re.compile(r'^\s*#')
GITIGNORE_BLANK_RE = re.compile(r'^\s*$')
GITIGNORE_NEGATE_RE = re.compile(r'^\s*!')

# .hgignore lines: "syntax: <name>" switches, comments and blank lines.
HGIGNORE_SYNTAX_RE = re.compile(
    r'^\s*syntax:\s*(glob|regexp|re)\s*$',
    re.I)
HGIGNORE_COMMENT_RE = re.compile(r'^\s*#')
HGIGNORE_BLANK_RE = re.compile(r'^\s*$')
# Global Options --------------------------------------------------------------
# (it's a prototype, shut up)
#
# Holds the parsed command-line options.  It is None at import time; main()
# rebinds it to the values object returned by the option parser, and the
# ignore, match and search functions below read it directly.
options = None
# Output ----------------------------------------------------------------------
def out(s, line_ending='\n'):
    """Write `s` to standard output, followed by `line_ending`."""
    stream = sys.stdout
    stream.write(''.join((s, line_ending)))
def err(s):
    """Write `s` to standard error, followed by a newline."""
    stream = sys.stderr
    stream.write(''.join((s, '\n')))
def die(s, exitcode=1):
    """Print `s` as an error message and terminate with status `exitcode`."""
    message = 'error: ' + s
    err(message)
    # sys.exit() is just this raise; spelled out to show control never returns.
    raise SystemExit(exitcode)
def warn(s):
    """Write `s` to standard error as a "warning: " line."""
    stream = sys.stderr
    stream.write(''.join(('warning: ', s, '\n')))
# Ignore Files ----------------------------------------------------------------
def compile_re(line):
    """Compile `line` as a regular expression and return a matcher for it.

    The returned callable takes a string and returns the result of searching
    it with the pattern: a match object, or None when nothing matches.

    If the pattern cannot be compiled, a warning is printed and a matcher
    that never matches is returned, so a broken line ignores nothing.
    """
    try:
        regex = re.compile(line)
    except (re.error, ValueError, OverflowError, RuntimeError):
        # Only failures of the pattern itself are handled here.  A bare
        # "except:" would also swallow KeyboardInterrupt and SystemExit.
        warn('could not compile regular expression "%s"' % line)
        return lambda s: False
    return regex.search
def glob_to_re(glob):
    r"""Translate a shell-style glob into regular-expression source text.

    Translation rules:

      \x     the character x, taken literally
      ?      any single character
      *      any run of characters other than '/'
      **     any run of characters, '/' included
      [...]  a character class, copied through verbatim

    Every other character is escaped so that it only matches itself.  The
    result is not anchored; callers add whatever anchoring they need.

    Malformed input degrades to literals instead of raising: a backslash at
    the very end stands for a literal backslash, and a '[' with no closing
    ']' stands for a literal '['.
    """
    pieces = []
    i, n = 0, len(glob)
    while i < n:
        ch = glob[i]
        i += 1
        if ch == '\\':
            if i < n:
                pieces.append(re.escape(glob[i]))
                i += 1
            else:
                # Nothing left to escape: keep the backslash itself.
                pieces.append(re.escape(ch))
        elif ch == '?':
            pieces.append('.')
        elif ch == '*':
            if i < n and glob[i] == '*':
                i += 1
                pieces.append('.*')
            else:
                pieces.append('[^/]*')
        elif ch == '[':
            close = glob.find(']', i)
            if close == -1:
                # Unterminated class: treat the bracket as a plain character
                # and let the loop translate what follows it normally.
                pieces.append(re.escape(ch))
            else:
                pieces.append('[' + glob[i:close] + ']')
                i = close + 1
        else:
            pieces.append(re.escape(ch))
    return ''.join(pieces)
def compile_literal(line):
    """Return a matcher that is true for any string containing `line`."""
    def contains(s):
        return line in s
    return contains
def compile_git(line):
    """Compile one .gitignore pattern into a matcher.

    The rules below were worked out by trial and error against git, because
    the wording of gitignore(5) is hard to follow:

    0. A trailing slash is dropped.  Git only lets such a pattern match
       directories; that restriction is not implemented here yet.
    1. Patterns are shell globs, except '*' never matches '/' and there is
       no '**'.
    2. A pattern without a slash is matched against the basename only, for
       example the 'file.txt' in '/foo/bar/file.txt'.
    3. A pattern with a slash is matched against the entire path.
    4. The match has to cover the whole string being searched:
       'am' does not ignore '/foo/bar/spam' (it is tested against the
       basename 'spam' but does not cover all of it), and 'bar/spam' does
       not ignore '/foo/bar/spam' (tested against the full path, same
       problem).
    5. A leading slash does not change the matching itself, but it turns a
       "pattern with no slash" into a "pattern with a slash":
       'bar' ignores '/foo/bar' (tested against the basename at that
       level), while '/bar' does not (tested against the entire path).

    A '[' with no closing ']' is treated as a literal bracket rather than
    raising.

    Returns whatever compile_re returns for the translated expression.
    """
    if line.endswith('/'):
        # TODO: remember that this pattern may only match directories.
        line = line[:-1]

    # Rule 5: decided before the leading slashes are stripped.
    has_slash = '/' in line
    line = line.lstrip('/')

    if has_slash:
        # Whole-path patterns are rooted at the './' that prefixes paths
        # produced by a search started in '.'.  The dot is escaped so it
        # cannot stand for an arbitrary character.
        pieces = [r'^\./']
    else:
        # Basename patterns are simulated by requiring the final path
        # divider right in front of the pattern.
        pieces = ['/']

    # The rest is git's variation on shell globs (no '**', no backslash
    # escapes).
    i, n = 0, len(line)
    while i < n:
        ch = line[i]
        i += 1
        if ch == '?':
            pieces.append('.')
        elif ch == '*':
            pieces.append('[^/]*')
        elif ch == '[':
            close = line.find(']', i)
            if close == -1:
                pieces.append(re.escape(ch))
            else:
                pieces.append('[' + line[i:close] + ']')
                i = close + 1
        else:
            pieces.append(re.escape(ch))

    # Patterns always have to be anchored at the end.
    pieces.append('$')

    # compile_re reports an uncompilable expression itself, so no extra
    # error handling is needed around this call.
    return compile_re(''.join(pieces))
def compile_hg_glob(line):
    """Compile one glob-syntax .hgignore pattern into a matcher.

    The glob is translated with glob_to_re and then anchored: it must start
    at the beginning of the path or right after a '/', and it must run to
    the end of the path.  A leading './' is removed from each candidate
    before it is searched.

    If the translated expression does not compile, a warning is printed and
    a matcher that never matches is returned.  This mirrors compile_re: a
    broken pattern ignores nothing rather than hiding every file.
    """
    pat = '(^|/)' + glob_to_re(line) + '$'
    try:
        regex = re.compile(pat)
    except (re.error, ValueError, OverflowError, RuntimeError):
        warn("could not parse hgignore pattern '%s'" % line)
        return lambda s: False

    def matches(s):
        return regex.search(s[2:] if s.startswith('./') else s)
    return matches
def compile_ff_glob(line):
    """Compile one glob-syntax .ffignore pattern into a matcher.

    The glob is translated with glob_to_re and handed to compile_re, which
    prints a warning and returns a never-matching matcher when the
    expression is invalid.  No extra error handling is needed here.
    """
    return compile_re(glob_to_re(line))
def parse_gitignore_file(path):
    """Read the .gitignore file at `path` and return its ignorers.

    Returns a list with one matcher (built by compile_git) per pattern line.
    Blank lines and comments are skipped.  Negated ('!') patterns are not
    supported yet and are skipped as well.  The list is empty when `path` is
    not an existing regular file.
    """
    if not os.path.isfile(path):
        return []

    ignorers = []
    with open(path) as f:
        for line in f:
            # Strip '\r' too, so files with CRLF line endings do not leave a
            # carriage return glued to the end of every pattern.
            line = line.rstrip('\r\n')
            if GITIGNORE_BLANK_RE.match(line):
                continue
            if GITIGNORE_COMMENT_RE.match(line):
                continue
            if GITIGNORE_NEGATE_RE.match(line):
                # TODO: support negated patterns.
                continue
            ignorers.append(compile_git(line))
    return ignorers
def parse_hgignore_file(path):
    """Read the .hgignore file at `path` and return its ignorers.

    Pattern lines are compiled according to the syntax currently in effect,
    which starts out as regular expressions and is switched by
    "syntax: glob" / "syntax: re" / "syntax: regexp" lines.  Blank lines and
    comments are skipped.  The list is empty when `path` is not an existing
    regular file.
    """
    if not os.path.isfile(path):
        return []

    syntax = IGNORE_SYNTAX_REGEX
    ignorers = []
    with open(path) as f:
        for line in f:
            # Strip '\r' too, so files with CRLF line endings do not leave a
            # carriage return glued to the end of every pattern.
            line = line.rstrip('\r\n')
            if HGIGNORE_BLANK_RE.match(line):
                continue
            if HGIGNORE_COMMENT_RE.match(line):
                continue

            switch = HGIGNORE_SYNTAX_RE.match(line)
            if switch:
                # The regex only admits glob, regexp and re.
                if switch.group(1).lower() == 'glob':
                    syntax = IGNORE_SYNTAX_GLOB
                else:
                    syntax = IGNORE_SYNTAX_REGEX
                continue

            # Anything else is a pattern.
            if syntax == IGNORE_SYNTAX_REGEX:
                ignorers.append(compile_re(line))
            elif syntax == IGNORE_SYNTAX_GLOB:
                ignorers.append(compile_hg_glob(line))
    return ignorers
def parse_ffignore_file(path):
    """Read the .ffignore file at `path` and return its ignorers.

    Pattern lines are compiled according to the syntax currently in effect,
    which starts out as regular expressions and is switched by
    "syntax: literal", "syntax: glob" and "syntax: re/regex/regexp" lines.
    Blank lines and comments are skipped.  The list is empty when `path` is
    not an existing regular file.
    """
    if not os.path.isfile(path):
        return []

    syntax_by_name = {
        'literal': IGNORE_SYNTAX_LITERAL,
        'glob': IGNORE_SYNTAX_GLOB,
        're': IGNORE_SYNTAX_REGEX,
        'regex': IGNORE_SYNTAX_REGEX,
        'regexp': IGNORE_SYNTAX_REGEX,
    }
    compiler_by_syntax = {
        IGNORE_SYNTAX_LITERAL: compile_literal,
        IGNORE_SYNTAX_REGEX: compile_re,
        IGNORE_SYNTAX_GLOB: compile_ff_glob,
    }

    syntax = IGNORE_SYNTAX_REGEX
    ignorers = []
    with open(path) as f:
        for line in f:
            # Strip '\r' too, so files with CRLF line endings do not leave a
            # carriage return glued to the end of every pattern.
            line = line.rstrip('\r\n')
            if IGNORE_BLANK_RE.match(line):
                continue
            if IGNORE_COMMENT_RE.match(line):
                continue

            switch = IGNORE_SYNTAX_RE.match(line)
            if switch:
                syntax = syntax_by_name[switch.group(1).lower()]
                continue

            # Anything else is a pattern.
            ignorers.append(compiler_by_syntax[syntax](line))
    return ignorers
def parse_ignore_files(dir):
    """Collect the ignorers from the ignore files located directly in `dir`.

    Only the file names listed in options.ignore_files are considered, in
    that order; names other than .ffignore, .gitignore and .hgignore are
    skipped.  Returns one flat list of matchers.
    """
    parsers = {
        '.ffignore': parse_ffignore_file,
        '.gitignore': parse_gitignore_file,
        '.hgignore': parse_hgignore_file,
    }
    collected = []
    for name in options.ignore_files:
        parse = parsers.get(name)
        if parse is not None:
            collected.extend(parse(os.path.join(dir, name)))
    return collected
def get_initial_ignorers():
    """Return the ignorers that apply before any directory is visited.

    These come from the .ffignore file in the directory named by the HOME
    environment variable.  The list is empty when .ffignore files are not
    enabled in options.ignore_files or when HOME is unset or empty.
    """
    if '.ffignore' not in options.ignore_files:
        return []
    home = os.environ.get('HOME')
    if not home:
        return []
    return parse_ffignore_file(os.path.join(home, '.ffignore'))
# Searching! ------------------------------------------------------------------
def get_type(path):
    """Classify `path` as one of the four TYPE_* constants.

    The result combines two independent checks: whether the path is a
    symlink, and whether it is a directory according to os.path.isdir.
    Anything that is not a directory counts as a file.
    """
    is_link = os.path.islink(path)
    if os.path.isdir(path):
        return TYPE_DIR_SYMLINK if is_link else TYPE_DIR_REAL
    return TYPE_FILE_SYMLINK if is_link else TYPE_FILE_REAL
def should_ignore(basename, path, ignorers):
    """Return True when the entry must be skipped entirely.

    An entry is skipped when VCS directories are being ignored and its
    `basename` is one of VCS_DIRS, or when any matcher in `ignorers` accepts
    its `path`.
    """
    is_vcs_dir = options.ignore_vcs_dirs and basename in VCS_DIRS
    if is_vcs_dir:
        return True
    return any(ignorer(path) for ignorer in ignorers)
def match(query, path, basename):
    """Decide whether the entry at `path` should be reported as a result.

    query    -- callable taking a string, truthy when the string matches
    path     -- path of the entry being tested
    basename -- final component of `path`

    The entry passes when it satisfies the type filter, the query (applied
    to `path` when options.entire is set, otherwise to `basename`), the size
    and modification-time filters and, when binary files are disallowed,
    the binary check.  The outcome is flipped when options.invert is set.
    """
    def _looks_binary():
        # Only regular files are examined.  Directories, sockets, FIFOs and
        # dangling symlinks cannot be sniffed and used to make this check
        # raise (or block), so they are never classed as binary.
        if not os.path.isfile(path):
            return False
        try:
            # Read bytes, not text: decoding arbitrary data can fail, and
            # the test is for a raw NUL byte in the first kilobyte.
            with open(path, 'rb') as f:
                return b'\0' in f.read(1024)
        except (IOError, OSError):
            # Unreadable file: it cannot be shown to be binary, so it is
            # not excluded.
            return False

    def _match():
        if options.type != TYPES_ALL:
            if get_type(path) not in options.type:
                return False

        if not query(path if options.entire else basename):
            return False

        # Only stat the entry when a size or time filter actually needs it.
        if (options.larger_than or options.smaller_than
                or options.before or options.after):
            info = os.lstat(path)
            if options.larger_than and info.st_size < options.larger_than:
                return False
            if options.smaller_than and info.st_size > options.smaller_than:
                return False
            if options.before and info.st_mtime > options.before:
                return False
            if options.after and info.st_mtime < options.after:
                return False

        if not options.binary and _looks_binary():
            return False

        return True

    result = _match()
    return not result if options.invert else result
def _search(query, dir, depth, ignorers):
    """Print the matches inside `dir`, then recurse into its subdirectories.

    query    -- matcher handed on to match()
    dir      -- directory to list
    depth    -- how many levels below the starting directory `dir` is
    ignorers -- matchers inherited from the directories above; the ignore
                files found in `dir` are added for this level and below

    Entries rejected by should_ignore are neither reported nor descended
    into.  Symlinked directories are only descended into when
    options.follow is set.  Recursion stops once `depth` reaches
    options.depth.  A directory that cannot be listed produces a warning
    and is skipped instead of aborting the whole search.
    """
    ignorers = ignorers + parse_ignore_files(dir)

    try:
        contents = os.listdir(dir)
    except OSError as e:
        warn('could not list directory "%s": %s' % (dir, e.strerror))
        return

    subdirs = []
    for item in contents:
        path = os.path.join(dir, item)
        if should_ignore(item, path, ignorers):
            continue

        if match(query, path, item):
            out(path, '\0' if options.zero else '\n')

        if os.path.isdir(path):
            if options.follow or not os.path.islink(path):
                subdirs.append(path)

    # All of this level is reported before any deeper level is visited.
    if depth < options.depth:
        for subdir in subdirs:
            _search(query, subdir, depth + 1, ignorers)
def search(query, dir='.', depth=0, ignorers=None):
    """Run a search for `query` and print every match.

    query    -- matcher callable applied to each candidate
    dir      -- directory to start in (default: the current directory)
    depth    -- depth value assigned to the starting directory
    ignorers -- initial list of ignore matchers; when None, the list from
                get_initial_ignorers() is used

    The arguments are now passed through to _search; previously they were
    accepted but the search always ran with the defaults.
    """
    if ignorers is None:
        ignorers = get_initial_ignorers()
    _search(query, dir, depth, ignorers)
# Option Parsing and Main -----------------------------------------------------
def build_option_parser():
    """Build and return the OptionParser for the command line.

    Options are declared in five groups after the main ones: case
    sensitivity, ignoring, time filtering, size filtering and type
    filtering.  Values are stored as given; main() converts and validates
    them afterwards.
    """
    parser = OptionParser("usage: %prog [options] PATTERN")

    # Main options
    parser.add_option(
        '-d', '--dir', default='.', metavar='DIR',
        help='root the search in DIR (default .)')
    parser.add_option(
        '-D', '--depth', default='25', metavar='N',
        help='search at most N directories deep (default 25)')
    parser.add_option(
        '-f', '--follow', action='store_true', default=False,
        help='follow symlinked directories and search their contents')
    parser.add_option(
        '-F', '--no-follow', dest='follow', action='store_false',
        help="don't follow symlinked directories (default)")
    parser.add_option(
        '-0', '--print0', dest='zero', action='store_true', default=False,
        help='separate matches with a null byte in output')
    parser.add_option(
        '-l', '--literal', action='store_true', default=False,
        help='force literal search, even if it looks like a regex')
    parser.add_option(
        '-v', '--invert', action='store_true', default=False,
        help='invert match')
    parser.add_option(
        '-e', '--entire', action='store_true', default=False,
        help='match PATTERN against the entire path string')
    parser.add_option(
        '-E', '--non-entire', dest='entire', action='store_false',
        help='match PATTERN against only the filenames (default)')

    # Case sensitivity
    case_group = OptionGroup(parser, "Configuring Case Sensitivity")
    case_group.add_option(
        '-s', '--case-sensitive', dest='case',
        action='store_const', const=CASE_SENSITIVE, default=CASE_SENSITIVE,
        help='case sensitive matching (default)')
    case_group.add_option(
        '-i', '--case-insensitive', dest='case',
        action='store_const', const=CASE_INSENSITIVE,
        help='case insensitive matching')
    case_group.add_option(
        '-S', '--case-smart', dest='case',
        action='store_const', const=CASE_SMART,
        help='smart case matching (sensitive if any uppercase chars '
             'are in the pattern, insensitive otherwise)')
    parser.add_option_group(case_group)

    # Ignoring
    ignore_group = OptionGroup(parser, "Configuring Ignoring")
    ignore_group.add_option(
        '-b', '--binary', dest='binary', action='store_true', default=True,
        help="allow binary files (default)")
    ignore_group.add_option(
        '-B', '--no-binary', dest='binary', action='store_false',
        help='ignore binary files')
    ignore_group.add_option(
        '-r', '--restricted', dest='ignore_mode',
        action='store_const', const=IGNORE_MODE_RESTRICTED,
        default=IGNORE_MODE_RESTRICTED,
        help="restricted search (skip VCS directories, "
             "parse all ignore files) (default)")
    ignore_group.add_option(
        '-q', '--semi-restricted', dest='ignore_mode',
        action='store_const', const=IGNORE_MODE_SEMI,
        help="semi-restricted search (don't parse VCS ignore files, "
             "but still skip VCS directories and parse .ffignore)")
    ignore_group.add_option(
        '-u', '--unrestricted', dest='ignore_mode',
        action='store_const', const=IGNORE_MODE_UNRESTRICTED,
        help="unrestricted search (don't parse ignore files, but "
             "still skip VCS directories)")
    ignore_group.add_option(
        '-a', '--all', dest='ignore_mode',
        action='store_const', const=IGNORE_MODE_ALL,
        help="don't ignore anything (ALL files can match)")
    ignore_group.add_option(
        '-I', '--ignore', metavar='PATTERN', action='append',
        help="add a pattern to be ignored (can be given multiple times)")
    parser.add_option_group(ignore_group)

    # Time filtering
    time_group = OptionGroup(parser, "Time Filtering")
    time_group.add_option(
        '--before', metavar='TIME',
        help='match files modified < TIME')
    time_group.add_option(
        '--after', metavar='TIME',
        help='match files modified > TIME')
    time_group.add_option(
        '--until', metavar='TIME',
        help='match files modified <= TIME')
    time_group.add_option(
        '--since', metavar='TIME',
        help='match files modified >= TIME')
    time_group.add_option(
        '--at', metavar='TIME',
        help='match files modified at TIME')
    time_group.add_option(
        '--created-before', metavar='TIME',
        help='match files created < TIME')
    time_group.add_option(
        '--created-after', metavar='TIME',
        help='match files created > TIME')
    time_group.add_option(
        '--created-until', metavar='TIME',
        help='match files created <= TIME')
    time_group.add_option(
        '--created-since', metavar='TIME',
        help='match files created >= TIME')
    time_group.add_option(
        '--created-at', metavar='TIME',
        help='match files created at TIME')
    # TODO
    # The time group is deliberately not attached to the parser yet.
    # NOTE(review): optparse groups appear to share the parser's option
    # tables, so these flags would still parse and only be missing from
    # --help; confirm before relying on it.
    # parser.add_option_group(time_group)

    # Size filtering
    size_group = OptionGroup(
        parser, "Size Filtering",
        "Sizes can be given as a number followed by a prefix. Some examples: "
        "1k, 5kb, 1.5gb, 2g, 1024b")
    size_group.add_option(
        '--larger-than', metavar='SIZE',
        help='match files larger than SIZE (inclusive)')
    size_group.add_option(
        '--bigger-than', dest='larger_than',
        help=optparse.SUPPRESS_HELP)
    size_group.add_option(
        '--smaller-than', metavar='SIZE',
        help='match files smaller than SIZE (inclusive)')
    parser.add_option_group(size_group)

    # Type filtering
    type_group = OptionGroup(
        parser, "Type Filtering",
        "Possible types are "
        "a (all), "
        "f (files), "
        "d (dirs), "
        "r (real), "
        "s (symlinked), "
        "e (real files), "
        "c (real dirs), "
        "x (symlinked files), "
        "y (symlinked dirs). "
        "If multiple types are given they will be unioned together: "
        "--type 'es' would match real files and all symlinks.")
    type_group.add_option(
        '-t', '--type', action='store', default=False, metavar='TYPE(S)',
        help='match only specific types of things (files, dirs, non-symlinks, symlinks)')
    parser.add_option_group(type_group)

    return parser
def build_type_set(types):
    """Turn a string of type letters into the set of TYPE_* values it names.

    types -- string of letters from "afdrsecxy" (case-insensitive), or a
             false value meaning "no restriction"

    Returns TYPES_ALL when `types` is empty or false, otherwise the union of
    the sets named by each letter.  An unknown letter ends the program with
    an error message instead of an unhandled KeyError.
    """
    if not types:
        return TYPES_ALL

    sets_by_letter = {
        'a': TYPES_ALL,
        'e': TYPES_FILE_REAL,
        'x': TYPES_FILE_SYMLINK,
        'c': TYPES_DIR_REAL,
        'y': TYPES_DIR_SYMLINK,
        'f': TYPES_FILE,
        'd': TYPES_DIR,
        'r': TYPES_REAL,
        's': TYPES_SYMLINK,
    }

    result = set()
    for c in types:
        chosen = sets_by_letter.get(c.lower())
        if chosen is None:
            die('invalid type "%s" (expected one of: a f d r s e c x y)' % c)
        # Union into a fresh set so the shared TYPES_* constants are never
        # modified.
        result = result | chosen
    return result
def parse_size(size):
    """Convert a size specification such as "1.5gb" into a byte count.

    size -- string made of a number, an optional unit letter (b, k, m, g, t
            or p, in either case) and optional trailing letters; spaces are
            ignored.  None and the empty string mean "no size given".

    Returns the size in bytes as an int, or None when no size was given.
    Ends the program with an error message when the string is malformed.
    """
    size = size.replace(' ', '') if size else size
    if not size:
        return None

    m = SIZE_RE.match(size)
    if not m:
        die('invalid size "%s"' % size)

    number, unit = m.groups()
    multipliers = {
        'b': BYTE,
        'k': KILOBYTE,
        'm': MEGABYTE,
        'g': GIGABYTE,
        't': TERABYTE,
        'p': PETABYTE,
    }
    # SIZE_RE matches case-insensitively, so the captured unit may be upper
    # case; the table is keyed by lower case letters.
    multiplier = multipliers[(unit or 'b').lower()]

    # The regex only admits digits with an optional fraction, so float()
    # cannot fail here.
    return int(float(number) * multiplier)
def is_re(s):
    """Try to guess if the string is a regex.

    Err on the side of "True", because treating a literal like a regex only
    slows you down a bit, but the other way around is broken behaviour.

    A string counts as a literal only when every character is an ASCII
    letter, an underscore or a dash; the empty string is a literal too.
    """
    # string.ascii_letters is used rather than string.letters, which depends
    # on the locale and does not exist in Python 3.
    plain = set(string.ascii_letters + '_-')
    return not all(c in plain for c in s)
def clean_ago_piece(n, unit):
    """Normalise one (number, unit) piece of a relative time string.

    Returns `n` as a float together with the length of the unit in seconds.
    A unit that is not one of the recognised spellings is handed back
    unchanged.
    """
    n = float(n)
    spellings = (
        (SECOND, ('s', 'sec', 'secs', 'second', 'seconds')),
        (MINUTE, ('m', 'min', 'mins', 'minute', 'minutes')),
        (HOUR, ('h', 'hour', 'hours')),
        (DAY, ('d', 'day', 'days')),
        (WEEK, ('w', 'week', 'weeks')),
        (MONTH, ('mo', 'mos', 'month', 'months')),
        (YEAR, ('y', 'year', 'years')),
    )
    for seconds, names in spellings:
        if unit in names:
            return n, seconds
    return n, unit
def parse_ago(start_time, timestr):
    """Subtract the durations named in `timestr` from `start_time`.

    start_time -- reference time, in seconds
    timestr    -- text containing "<number><unit>" pieces, e.g. "1d 6h"

    Every piece found by AGO_RE is converted to seconds and subtracted.
    Returns the resulting time truncated to an int.  Naming the same unit
    twice ends the program with an error message that quotes the unit as
    the user typed it.
    """
    seen = set()
    result = start_time
    for number, unit_text in AGO_RE.findall(timestr):
        n, unit = clean_ago_piece(number, unit_text)
        # Duplicates are detected on the normalised unit, so "5m 3min" is
        # caught, but the message reports the text rather than the number
        # of seconds the unit was converted to.
        if unit in seen:
            die('duplicate "%s" in time specification' % unit_text)
        seen.add(unit)
        result -= n * unit
    return int(result)
def parse_time(timestr):
    """Parse a time string into whole seconds past the epoch.

    Only relative specifications such as "3 days" or "1h 30m" are
    understood; they are counted back from the current time.  Returns None
    when `timestr` does not begin with such a specification.
    """
    now = int(time.time())
    timestr = timestr.strip().lower()
    if not AGO_RE.match(timestr):
        return None
    return parse_ago(now, timestr)
def main():
    """Parse the command line, normalise the options and run the search.

    The parsed options are stored in the module-level `options` global,
    converted in place (depth to int, sizes to bytes, times to epoch
    seconds, type letters to a set, ignore mode to concrete settings), and
    the pattern is turned into a matcher callable handed to search().
    Invalid input ends the program with an error message.
    """
    global options
    (options, args) = build_option_parser().parse_args()

    # PATTERN
    if len(args) > 1:
        die("only one search pattern can be given")
    query = args[0] if args else ''

    # --dir
    if options.dir:
        try:
            os.chdir(options.dir)
        except OSError:
            die('could not change to directory "%s"' % options.dir)

    # --depth
    try:
        depth = int(options.depth)
    except ValueError:
        depth = -1
    if depth < 0:
        die('depth must be a non-negative integer (got "%s")' % options.depth)
    options.depth = depth

    # --case-*
    if options.case == CASE_SMART:
        # ascii_uppercase rather than string.uppercase, which depends on the
        # locale and does not exist in Python 3.
        if any(c in string.ascii_uppercase for c in query):
            options.case = CASE_SENSITIVE
        else:
            options.case = CASE_INSENSITIVE

    # --type
    options.type = build_type_set(options.type)

    # --larger-than, --smaller-than
    options.larger_than = parse_size(options.larger_than)
    options.smaller_than = parse_size(options.smaller_than)
    if options.larger_than or options.smaller_than:
        # Directory sizes are not supported.
        options.type = options.type - TYPES_DIR

    # time filtering
    for name in ('before', 'after'):
        raw = getattr(options, name)
        if raw:
            parsed = parse_time(raw)
            if parsed is None:
                # An unparseable time used to be dropped silently, which
                # returned unfiltered results.
                die('invalid time "%s"' % raw)
            setattr(options, name, parsed)

    # Ignore files
    if options.ignore_mode == IGNORE_MODE_RESTRICTED:
        options.ignore_files = ['.ffignore', '.gitignore', '.hgignore']
        options.ignore_vcs_dirs = True
    elif options.ignore_mode == IGNORE_MODE_SEMI:
        options.ignore_files = ['.ffignore']
        options.ignore_vcs_dirs = True
    elif options.ignore_mode == IGNORE_MODE_UNRESTRICTED:
        options.ignore_files = []
        options.ignore_vcs_dirs = True
    elif options.ignore_mode == IGNORE_MODE_ALL:
        options.ignore_files = []
        options.ignore_vcs_dirs = False
    # NOTE(review): the patterns collected by -I/--ignore (options.ignore)
    # are not consumed here or anywhere else visible in this file.

    # Build the query matcher.
    if options.literal or not is_re(query):
        if options.case == CASE_SENSITIVE:
            literal = query
            query = lambda s: literal in s
        else:
            literal = query.lower()
            query = lambda s: literal in s.lower()
    else:
        flags = 0 if options.case == CASE_SENSITIVE else re.IGNORECASE
        try:
            r = re.compile(query, flags)
        except re.error as e:
            die('invalid regular expression "%s": %s' % (query, e))
        query = lambda s: r.search(s)

    # Go!
    search(query)
if __name__ == '__main__':
    import signal

    def sigint_handler(signum, frame):
        # On Ctrl-C: finish the current output line, then exit with
        # status 130.
        sys.stdout.write('\n')
        sys.exit(130)

    signal.signal(signal.SIGINT, sigint_handler)
    main()