Gentoo Archives: gentoo-dev

From: Daniel Armyr <daniel.armyr@××××.se>
To: "leon j. breedt" <ljb@×××××××××.org>
Cc: gentoo-dev@g.o
Subject: Re: [gentoo-dev] orphaned files on system?
Date: Sun, 20 Apr 2003 14:50:15
Message-Id: 3EA2B2B7.7030602@home.se
In Reply to: [gentoo-dev] orphaned files on system? by "leon j. breedt"
1 Hm, gave it a run. The code looks pretty neat. I haven't studied it
2 enough to know exactly what it does, but it seems like it has some real
3 nifty features. Unfortunately, it seems awfully slow and heavy on the
4 system. My first run crashed X, I think. The second hung up when caching
5 vanilla-sources. Third time worked though, and I like the brevity of the
6 list it produces. A check on the used processor time says it is about 4
7 times slower than a simple sh script that was presented a while back
8 which does no filtering of the output. It simply prints all files on the
9 system not mentioned in /var/db/pkg. Either way, looks like a good
10 program, but I think it can be worth the attempt to try to find ways to
11 make it faster.
12 //Daniel Armyr
13
14 leon j. breedt wrote:
15
16 >hi,
17 >
18 >i use the attached script to scan for unpackaged files on my filesystem,
19 >and found quite a few in /etc, /usr/lib, /usr/X11R6 as well as the
20 >expected places. most of them were symlinks, the intention being fairly
21 >obvious (like the NVIDIA OpenGL stuff).
22 >
23 >but i was hoping someone could explain why files like /etc/make.conf, /etc/csh.env,
24 >/etc/env.d/05gcc and /usr/include/awk/acconfig.h didn't belong to any package.
25 >
26 >i run the script with:
27 >
28 >$ ./gtfilelint -v -C gtfilelint.conf -o orphans.list
29 >
30 >use -h to see available params. multiple -v increases verbosity.
31 >
32 >if you have a system with lots of packages, it's going to take some time,
33 >as it caches all the /var/db/pkg/**/CONTENTS entries in a Berkeley hashdb
34 >for quick lookups, then runs /usr/bin/find on /, and compares results. exclusions
35 >to the find output are made by adding python re module regexes to
36 >gtfilelint.conf.
37 >
38 >if you run it as user, you may get some error output from find about permissions.
39 >you will want to specify a config file, otherwise you'll get a lot of stuff
40 >you probably don't care about.
41 >
42 >hope someone finds this useful
43 >
44 >leon
45 >
46 >
47 >
48 >------------------------------------------------------------------------
49 >
50 >#!/usr/bin/env python
51 >#
52 ># Finds files on Gentoo Linux systems that do not belong
53 ># to any installed package.
54 >#
55 ># Released under the GNU GPL.
56 >#
57 ># (C) Copyright 2003 Leon J. Breedt
58 >#
59 ># $Id$
60 >
61 >import dbhash
62 >import getopt
63 >import os
64 >import os.path
65 >import re
66 >import string
67 >import sys
68 >
69 >TRUE = 1
70 >FALSE = 0
71 >version = '0.1.1'
72 >configfile = '/etc/gtfilelint.conf'
73 >dbdir = '/var/db/pkg'
74 >cachefile = '/tmp/gtfilelint.db'
75 >outputfile = None
76 >warnmissing = FALSE
77 >findcmd = 'find / -print'
78 >exclusions = []
79 >verbosity = 0
80 >cachedb = None
81 >
82 >def verb(msg, level=1):
83 > if verbosity >= level:
84 > sys.stderr.write('-- %s\n' % msg)
85 >
86 >def vverb(msg):
87 > verb(msg, 2)
88 >
89 >def info(msg):
90 > sys.stderr.write('>> %s\n' % msg)
91 >
92 >def error(msg):
93 > sys.stderr.write('error: %s\n' % msg)
94 > sys.exit(1)
95 >
96 >def warn(msg):
97 > sys.stderr.write('warning: %s\n' % msg)
98 >
99 >def usage():
100 > print 'usage: %s [options]' % sys.argv[0]
101 > print 'options:'
102 > print '-h|--help display this message'
103 > print '-V|--version print program version and exit'
104 > print '-v|--verbose print verbose messages about what is being done'
105 > print '-d|--dbdir directory containing package database (default: %s)' % dbdir
106 > print '-c|--cachefile file to place temporary cache in (default: %s)' % cachefile
107 > print '-C|--configfile configuration file (default: %s)' % configfile
108 > print '-o|--outputfile file to print orphan list to (default: stdout)'
109 > print '--warnmissing warn if files declared in CONTENTS don\'t exist'
110 >
111 >def parse_cmdline():
112 > global verbosity, dbdir, cachefile, configfile, outputfile, warnmissing
113 > opts, args = getopt.getopt(sys.argv[1:], "hVvd:c:C:o:", ["help", "version", "verbose", "dbdir=", "cachefile=", "configfile=", "outputfile=", "warnmissing"])
114 > for opt, arg in opts:
115 > if opt in ("-h", "--help"):
116 > usage()
117 > sys.exit(0)
118 > if opt in ("-V", "--version"):
119 > print version
120 > sys.exit(0)
121 > if opt in ("-v", "--verbose"):
122 > verbosity = verbosity + 1
123 > if opt in ("-d", "--dbdir"):
124 > dbdir = arg
125 > if opt in ("-c", "--cachefile"):
126 > cachefile = arg
127 > if opt in ("-C", "--configfile"):
128 > configfile = arg
129 > if opt in ("-o", "--outputfile"):
130 > outputfile = arg
131 > if opt == "--warnmissing":
132 > warnmissing = TRUE
133 >
134 >def parse_config():
135 > if not os.path.exists(configfile) or not os.access(configfile, os.R_OK):
136 > warn('missing configfile "%s"' % configfile)
137 > return
138 > fp = open(configfile, 'r')
139 > for line in fp.readlines():
140 > line = string.strip(line)
141 > if len(line) == 0:
142 > continue
143 > if line[0] == '#':
144 > continue
145 > exclusions.append(re.compile(line))
146 > verb('adding "%s" to list of exclusion regular expressions' % line)
147 > fp.close()
148 >
149 >def cache_package_files(package, packagepath):
150 > verb('caching contents of "%s"' % package)
151 > fp = open(packagepath + '/CONTENTS')
152 > lineno = 0
153 > for line in fp.readlines():
154 > lineno = lineno + 1
155 > line = string.strip(line)
156 > if len(line) == 0:
157 > continue
158 > key = None
159 > m = re.match(r"^dir (\S.*)$", line)
160 > if m:
161 > key = m.group(1)
162 > m = None
163 > else:
164 > m = re.match(r"^obj (\S.*) (\S+) (\d+)\s*$", line)
165 > if m:
166 > key = m.group(1)
167 > m = None
168 > m = re.match(r"^sym (\S.*) -> .*$", line)
169 > if m:
170 > key = m.group(1)
171 > if key != None:
172 > if not os.path.exists(key) and warnmissing:
173 > warn('%s: "%s" does not exist on filesystem, ignoring' % (package, key))
174 > vverb('caching "%s"' % key)
175 > cachedb[key] = ''
176 > else:
177 > vverb('key is None for "%s" CONTENTS line %d' % (package, lineno))
178 > fp.close()
179 >
180 >def scan_group_packages(group, grouppath):
181 > packages = os.listdir(grouppath)
182 > packages.sort()
183 > verb('found %d packages in group "%s"' % (len(packages), group))
184 > for package in packages:
185 > packagepath = grouppath + '/' + package
186 > cache_package_files(package, packagepath)
187 >
188 >def create_system_filelist():
189 > info('scanning all files on system')
190 > sout = os.popen(findcmd, 'r')
191 > verb('reading paths from "%s"' % findcmd)
192 > paths = sout.readlines()
193 > orphans = 0
194 > rptfp = None
195 > for path in paths:
196 > path = string.strip(path)
197 > if len(path) == 0:
198 > continue
199 > if path[0] != '/':
200 > warn('ignoring relative path "%s"' % path)
201 > continue
202 > matched = FALSE
203 > for exre in exclusions:
204 > if exre.match(path):
205 > matched = TRUE
206 > break
207 > if matched:
208 > vverb('"%s" matched exclusion regex, ignoring' % path)
209 > continue
210 > if not cachedb.has_key(path):
211 > if orphans == 0:
212 > if outputfile:
213 > info('writing orphaned file list [%s]' % outputfile)
214 > rptfp = open(outputfile, 'w+')
215 > else:
216 > info('orphaned files:')
217 > rptfp = sys.stdout
218 > rptfp.flush()
219 > orphans = orphans + 1
220 > rptfp.write('%s\n' % path)
221 > rptfp.flush()
222 > if rptfp:
223 > rptfp.close()
224 > sout.close()
225 > if orphans > 0:
226 > info('%d orphaned file(s) found' % orphans)
227 > else:
228 > info('no orphaned files on system')
229 >
230 ># Main
231 >try:
232 > parse_cmdline()
233 > parse_config()
234 > info('creating packaged files cache [%s]' % cachefile)
235 > cachedb = dbhash.open(cachefile, 'n')
236 > try:
237 > groups = os.listdir(dbdir)
238 > groups.sort()
239 > for group in groups:
240 > grouppath = dbdir + '/' + group
241 > scan_group_packages(group, grouppath)
242 > create_system_filelist()
243 > finally:
244 > if cachedb:
245 > cachedb.close()
246 > os.unlink(cachefile)
247 >except KeyboardInterrupt: 1
248 >except:
249 > raise
250 >
251 >
252 >------------------------------------------------------------------------
253 >
254 ># we don't really care about these dynamic paths
255 >^/var/log/.*
256 >^/var/db/.*
257 >^/var/spool/.*
258 >^/var/tmp/.*
259 >^/var/lib/.*
260 >^/var/cache/.*
261 >^/var/run/.*
262 >
263 ># / is not owned by any package
264 >^/$
265 >
266 ># don't care about root's config files
267 >^/root/.*
268 >
269 ># /usr/local is typically just user compiled stuff,
270 ># don't care about it -- this list from baselayout
271 >^/usr/local/bin/.*
272 >^/usr/local/doc$
273 >^/usr/local/lib/.*
274 >^/usr/local/man$
275 >^/usr/local/src/.*
276 >^/usr/local/sbin/.*
277 >^/usr/local/games/.*
278 >^/usr/local/share/doc/.*
279 >^/usr/local/share/man/.*
280 >^/usr/local/share/.*
281 >
282 ># what is /lib/dev-state? dunno...but the dir is in
283 ># baselayout, even if the files aren't
284 >^/lib/dev-state/.*
285 >
286 ># devices aren't that important to us...the packaged
287 ># files will not be visible anyway due to devfs
288 >^/dev/.*
289 >
290 ># anyone packaging anything into /tmp should be shot
291 >^/tmp/.*
292 >
293 ># portage tree we don't care about either
294 >^/usr/portage$
295 >^/usr/portage/.*
296 >
297 ># mountpoints shouldn't have package files installed in them
298 >^/mnt/.*
299 >
300 ># system filesystems should be ignored
301 >^/proc/.*
302 >^/sys/.*
303 >
304 ># USER CUSTOMIZATIONS
305 >^/data
306 >^/data/.*
307 >^/cdrom.*
308 >^/windata
309 >^/windata/.*
310 >
311 >
312
313
314 --
315 gentoo-dev@g.o mailing list