Did anyone suggest fslint? Apart from finding unnecessary files, duplicates
and broken links, there's an education to be had in the clever scripting behind
it all:
*********************************************************************************************************
#!/bin/bash

# findup - find duplicate files
# Copyright (c) 2000-2006 by Pádraig Brady <P@××××××××××.com>.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details,
# which is available at www.gnu.org


# Description
#
# will show duplicate files in the specified directories
# (and their subdirectories), in the format:
#
# 2 * 2048 file1 file2
# 3 * 1024 file3 file4 file5
# 2 * 1024 file6 file7
#
# Where the number is the disk usage in bytes of each of the
# duplicate files on that line, and all duplicate files are
# shown on the same line.
# Output is ordered by largest disk usage first and
# then by the number of duplicate files.
#
# Caveats/Notes:
# I compared this to any equivalent utils I could find (as of Nov 2000)
# and it's (by far) the fastest, has the most functionality (thanks to
# find) and has no (known) bugs. In my opinion fdupes is the next best but
# is slower (even though written in C), and has a bug where hard links
# in different directories are reported as duplicates sometimes.
#
# This script requires uniq > V2.0.21 (part of GNU textutils|coreutils)
# undefined operation if any dir/file names contain \n or \\
# sparse files are not treated differently.
# Don't specify params to find that affect output etc. (e.g. -printf etc.)
# zero length files are ignored.
# symbolic links are ignored.
# path1 & path2 can be files &/or directories
# Resolve the directory containing this script so the bundled support
# files (supprt/*) can be found regardless of the caller's cwd.
# Both expansions are quoted so a script path containing spaces works
# (the original used unquoted `dirname $0`).
script_dir=$(dirname "$0")              # directory of this script
script_dir=$(readlink -f "$script_dir") # make sure it's an absolute path

# Shared definitions (version info etc.) from the fslint support dir.
# NOTE(review): presumably this provides Version() and check_uniq used
# below — confirm against supprt/fslver.
. "$script_dir/supprt/fslver"
# Print the help text and exit (always exit status 0).
# Fixes vs. original: "currrent" -> "current", "specfied" -> "specified",
# and $( ) instead of backticks.
Usage() {
  ProgName=$(basename "$0")
  echo "find dUPlicate files.
Usage: $ProgName [[-t [-m|-d]] [-r] [-f] paths(s) ...]

If no path(s) specified then the current directory is assumed.

When -m is specified any found duplicates will be merged (using hardlinks).
When -d is specified any found duplicates will be deleted (only 1 left).
When -t is specified, only report what -m or -d would do.

You can also pipe output to $script_dir/fstool/dupwaste to
get a total of the wastage due to duplicates.

Examples:

search for duplicates in current directory and below
findup or findup .
search for duplicates in all linux source directories and merge using hardlinks
findup -m /usr/src/linux*
same as above but don't look in subdirectories
findup -r .
search for duplicates in /usr/bin
findup /usr/bin
search in multiple directories but not their subdirectories
findup -r /usr/bin /bin /usr/sbin /sbin
search for duplicates in \$PATH
findup \`$script_dir/supprt/getffp\`
search system for duplicate files over 100K in size
findup / -size +100k
search only my files (that I own and are in my home dir)
findup ~ -user \`id -u\`
search system for duplicate files belonging to roger
findup / -user \`id -u roger\`"
  exit
}
# Parse command-line options.  Anything unrecognised is collected
# (single-quoted) into $argsToPassOn and later handed to find(1) via
# supprt/getfpf.
# NOTE(review): an argument containing a single quote will break this
# quoting scheme — a longstanding limitation of the getfpf protocol.
for arg
do
  case "$arg" in
  -h|--help|-help)
    Usage ;;
  -v|--version)
    Version ;;
  --gui)
    mode="gui" ;;
  -m)
    mode="merge" ;;
  -d)
    mode="del" ;;
  -t)
    t="t" ;;
  *)
    argsToPassOn="$argsToPassOn '$arg'"
  esac
done

# Hardlink merging can't cross filesystems, so keep find on one device.
[ "$mode" = "merge" ] && argsToPassOn="$argsToPassOn -xdev"

# Any action mode (gui/merge/del) needs absolute paths and a separator
# between duplicate groups in the intermediate output.
if [ -n "$mode" ]; then
  forceFullPath="-f"
  sep_mode="prepend"
else
  sep_mode="none"
fi

# When we may act on duplicates, existing hardlink groups must be kept
# for processing; otherwise collapse them up front (-u) to skip work.
if [ "$mode" = "gui" ] || [ "$mode" = "merge" ] || [ "$mode" = "del" ]; then
  merge_early=""  # process hardlinks
else
  merge_early="-u" # ignore hardlinks
fi
# Build $FPF (the find -printf format for file names, absolute when -f)
# and re-establish the positional parameters from $argsToPassOn.
# Sourced, not executed, because it must set variables/"$@" in this shell.
# $forceFullPath is deliberately unquoted: when empty it must vanish,
# not become an empty argument.  The script path itself is quoted.
. "$script_dir/supprt/getfpf" $forceFullPath "$argsToPassOn"

# Abort early unless uniq supports the options used below
# (header note: requires uniq > V2.0.21).
check_uniq
# Decide how to print each file's device number.  Old find(1) without
# -printf %D support echoes the literal "D", so probe on "." first:
# fall back to a constant 0 (octal \060) in that case, otherwise use the
# real device number, which helps distinguish identical inodes on
# different filesystems (finds more duplicates correctly).
# The command substitution is quoted so the test doesn't degenerate to
# the syntax error "[ = D ]" when find fails and prints nothing.
if [ "$(find . -maxdepth 0 -printf "%D" 2> /dev/null)" = "D" ]
then
  devFmt="\060" # 0
else
  devFmt=%D # this is new and will help find more duplicate files
fi
# Main pipeline: list candidate files (name, dev, inode, size), group by
# size, then confirm duplicates with md5sum and again with sha1sum.
# Spaces/tabs/NULs in names are transliterated around each text tool and
# restored afterwards (header caveat: names with \n or \\ are undefined).

# print name, dev, inode & size.
find "$@" -size +0c -type f -printf "$FPF\0$devFmt\0%i\0%s\n" |
tr ' \t\0' '\0\1 ' |       # remove spaces, tabs in file names
sort -k2,2n -k4,4nr -k3,3 $merge_early | # group [and merge] dev,size & inodes
if [ -z "$merge_early" ]; then
  "$script_dir/supprt/rmlint/merge_hardlinks"
else
  uniq -3 -D               # pick just duplicate filesizes
fi |
sort -k3,3n |        # NB sort inodes so md5sum does less seeking all over disk
cut -f1 -d' ' -s |   # get filenames to work on
tr '\0\1\n' ' \t\0' | # reset any space & tabs etc. and delimit names with \0
xargs -r0 md5sum -- | # calculate md5sums for possible duplicates
sort |                # group duplicate files together
tr ' \t' '\1\2' |     # remove spaces & tabs again (sed can't match \0)
sed -e 's/\(^.\{32\}\)..\(.*\)/\2 \1/' | # switch sums and filenames

# The following optional block checks duplicates again using sha1.
# Note for data sets that don't totally fit in cache this will
# probably read duplicate files off the disk again.
uniq --all-repeated -1 | # pick just duplicates
cut -d' ' -f1 |          # get filenames
sort |                   # sort by paths to try to minimise disk seeks
tr '\1\2\n' ' \t\0' |    # reset any space & tabs etc. and delimit names with \0
xargs -r0 sha1sum -- |   # to be sure to be sure
sort |                   # group duplicate files together
tr ' \t' '\1\2' |        # remove spaces & tabs again (sed can't match \0)
sed -e 's/\(^.\{40\}\)..\(.*\)/\2 \1/' | # switch sums and filenames

uniq --all-repeated=$sep_mode -1 | # pick just duplicates
sed -e 's/\(^.*\) \(.*\)/\2 \1/' | # switch sums and filenames back
tr '\1\2' ' \t' |                  # put spaces & tabs back

if [ -n "$mode" ]; then
  cut -d' ' -f2- |
  # $mode quoted (the original's unquoted test was fragile).
  if [ "$mode" != "gui" ]; then # external call to python as this is faster
    # Support-tool paths quoted so a script_dir containing spaces works.
    if [ -f "$script_dir/supprt/rmlint/fixdup.py" ]; then
      "$script_dir/supprt/rmlint/fixdup.py" $t$mode
    elif [ -f "$script_dir/supprt/rmlint/fixdup.sh" ]; then
      "$script_dir/supprt/rmlint/fixdup.sh" $t$mode
    else
      echo "Error, couldn't find merge util" >&2
      exit 1
    fi
  else
    cat
  fi
else
  # Report mode: fold each checksum group onto one line in the form
  #   "<count> * <size> file1 file2 ..."
  # where <size> comes from `du -b` of the group's first file.
  (
  psum='no match'
  line=''
  declare -i counter
  while read sum file; do # sum is delimited by first space
    if [ "$sum" != "$psum" ]; then
      # New checksum group: flush the previous one first.
      if [ -n "$line" ]; then
        echo "$counter * $line"
      fi
      counter=1
      line="$(du -b "$file")"
      psum="$sum"
    else
      counter=counter+1 # use bash arithmetic, not expr (for speed)
      line="$line $file"
    fi
  done

  # Flush the final group.
  if [ -n "$line" ]; then
    echo "$counter * $line"
  fi
  ) |
  sort -k3,3 -k1,1 -brn # largest disk usage first, then most duplicates
fi
*************************************************************************************************************


On Fri, Oct 17, 2008 at 7:12 AM, Paul Stear <gentoo@××××××××××××.com> wrote:

> On Thursday 16 October 2008 15:52:59 Richard Freeman wrote:
>
> > To add to the chorus of suggestions, may I offer "kdirstat"? It is in
> > portage and does a great job of mapping file use, as well as some
> > administrative tools for cleanup. Just be careful when deleting files
> > that you don't just move them to the trash.
>
> Well thanks again for all the responses, kdirstat is now emerged and looks
> good at identifying all my rubbish.
> Paul
>
> --
> This message has been sent using kmail with gentoo linux
>
>