mirror of
https://github.com/9fans/plan9port.git
synced 2025-01-15 11:20:03 +00:00
57 lines
1.2 KiB
Awk
57 lines
1.2 KiB
Awk
|
# when raw index has a lot of entries like
|
||
|
# 1578324 problematico, a, ci, che
|
||
|
# apply this algorithm:
|
||
|
# treat things after comma as suffixes
|
||
|
# for each suffix:
|
||
|
# if single letter, replace last letter
|
||
|
# else search backwards for beginning of suffix
|
||
|
# and if it leads to an old suffix of approximately
|
||
|
# the same length, replace that suffix
|
||
|
# This will still leave some commas to fix by hand
|
||
|
# Usage: awk -F'\t' -f comfix.awk rawindex > newrawindex
|
||
|
|
||
|
# Records with exactly two fields: $1 is the key, $2 the headword text.
# If $2 has no comma the record is printed unchanged.  Otherwise $2 is
# split on "comma plus optional spaces": the first piece is the base
# word, every later piece is treated as a replacement suffix.  One
# "key<TAB>word" line is written for the base word and one per suffix.
NF == 2 {
	# An empty $2 cannot contain a comma, so this single test also
	# covers the empty-field case.
	if (index($2, ",") > 0) {
		count = split($2, parts, /,[ ]*/)
		head = parts[1]
		printf "%s\t%s\n", $1, head
		idx = 2
		while (idx <= count) {
			ending = parts[idx]
			# cut = number of trailing characters of head to overwrite,
			# or 0 when matchsuflen finds no acceptable match.
			cut = matchsuflen(head, ending)
			if (cut)
				printf "%s\t%s\n", $1, substr(head, 1, length(head) - cut) ending
			else
				# No match: keep the comma form so it can be fixed by hand.
				printf "%s\t%s\n", $1, head ", " ending
			idx++
		}
	} else
		print
}
|
||
|
# Any record that does not have exactly two fields is copied to the
# output untouched (a bare print writes the whole record, $0).
NF != 2 { print }
|
||
|
|
||
|
# matchsuflen(w, suf): number of trailing characters of word w that the
# replacement suffix suf should overwrite; 0 means "no acceptable match".
#
#   - A one-character suffix always replaces the last character: returns 1.
#   - Otherwise w is scanned from its end for the nearest occurrence of
#     the first character of suf.  Its distance k from the end (1 = last
#     character) is taken as the length of the old suffix, and is
#     returned only if k and length(suf) differ by at most 3.
#   - An empty suffix, or a first character that does not occur in w,
#     returns 0.
#
# Names after the gap in the parameter list are locals (awk convention);
# callers pass only w and suf.
function matchsuflen(w, suf,		wlen,suflen,c,k,d)
{
	wlen = length(w)
	suflen = length(suf)
	if(suflen == 0)
		return 0	# nothing to anchor the backward search on
	if(suflen == 1)
		return 1
	c = substr(suf, 1, 1)
	# k counts positions from the end of w: k == 1 is the last character.
	for(k = 1; k <= wlen; k++)
		if(substr(w, wlen-k+1, 1) == c)
			break
	if(k > wlen)
		return 0	# first character of suf never appears in w
	d = k - suflen
	if(d < 0)
		d = -d
	# Reject when old and new suffix lengths are too different.
	return (d > 3) ? 0 : k
}
|