/* tr.c (6458B) */
1 /* See LICENSE file for copyright and license details. */ 2 #include <stdlib.h> 3 4 #include "utf.h" 5 #include "util.h" 6 7 static int cflag = 0; 8 static int dflag = 0; 9 static int sflag = 0; 10 11 struct range { 12 Rune start; 13 Rune end; 14 size_t quant; 15 }; 16 17 static struct { 18 char *name; 19 int (*check)(Rune); 20 } classes[] = { 21 { "alnum", isalnumrune }, 22 { "alpha", isalpharune }, 23 { "blank", isblankrune }, 24 { "cntrl", iscntrlrune }, 25 { "digit", isdigitrune }, 26 { "graph", isgraphrune }, 27 { "lower", islowerrune }, 28 { "print", isprintrune }, 29 { "punct", ispunctrune }, 30 { "space", isspacerune }, 31 { "upper", isupperrune }, 32 { "xdigit", isxdigitrune }, 33 }; 34 35 #define ISLOWERBIT 1U << 6 36 #define ISUPPERBIT 1U << 10 37 38 static struct range *set1 = NULL; 39 static size_t set1ranges = 0; 40 static unsigned set1checks = 0; 41 static struct range *set2 = NULL; 42 static size_t set2ranges = 0; 43 static unsigned set2checks = 0; 44 45 static int 46 check(Rune rune, unsigned checks) 47 { 48 size_t i; 49 50 for (i = 0; checks && i < LEN(classes); i++, checks >>= 1) 51 if (checks & 1 && classes[i].check(rune)) 52 return 1; 53 54 return 0; 55 } 56 57 static size_t 58 rangelen(struct range r) 59 { 60 return (r.end - r.start + 1) * r.quant; 61 } 62 63 static size_t 64 setlen(struct range *set, size_t setranges) 65 { 66 size_t len = 0, i; 67 68 for (i = 0; i < setranges; i++) 69 len += rangelen(set[i]); 70 71 return len; 72 } 73 74 static int 75 rstrmatch(Rune *r, char *s, size_t n) 76 { 77 size_t i; 78 79 for (i = 0; i < n; i++) 80 if (r[i] != s[i]) 81 return 0; 82 return 1; 83 } 84 85 static size_t 86 makeset(char *str, struct range **set, unsigned *checks) 87 { 88 Rune *rstr; 89 size_t len, i, j, m, n; 90 size_t q, setranges = 0; 91 int factor, base; 92 93 /* rstr defines at most len ranges */ 94 unescape(str); 95 rstr = ereallocarray(NULL, utflen(str) + 1, sizeof(*rstr)); 96 len = utftorunestr(str, rstr); 97 *set = 
ereallocarray(NULL, len, sizeof(**set)); 98 99 for (i = 0; i < len; i++) { 100 if (rstr[i] == '[') { 101 j = i; 102 nextbrack: 103 if (j >= len) 104 goto literal; 105 for (m = j; m < len; m++) 106 if (rstr[m] == ']') { 107 j = m; 108 break; 109 } 110 if (j == i) 111 goto literal; 112 113 /* CLASSES [=EQUIV=] (skip) */ 114 if (j - i > 3 && rstr[i + 1] == '=' && rstr[m - 1] == '=') { 115 if (j - i != 4) 116 goto literal; 117 (*set)[setranges].start = rstr[i + 2]; 118 (*set)[setranges].end = rstr[i + 2]; 119 (*set)[setranges].quant = 1; 120 setranges++; 121 i = j; 122 continue; 123 } 124 125 /* CLASSES [:CLASS:] */ 126 if (j - i > 3 && rstr[i + 1] == ':' && rstr[m - 1] == ':') { 127 for (n = 0; n < LEN(classes); n++) { 128 if (rstrmatch(rstr + i + 2, classes[n].name, j - i - 3)) { 129 *checks |= 1 << n; 130 i = j; 131 break; 132 } 133 } 134 if (n < LEN(classes)) 135 continue; 136 eprintf("Invalid character class.\n"); 137 } 138 139 /* REPEAT [_*n] (only allowed in set2) */ 140 if (j - i > 2 && rstr[i + 2] == '*') { 141 /* check if right side of '*' is a number */ 142 q = 0; 143 factor = 1; 144 base = (rstr[i + 3] == '0') ? 8 : 10; 145 for (n = j - 1; n > i + 2; n--) { 146 if (rstr[n] < '0' || rstr[n] > '9') { 147 n = 0; 148 break; 149 } 150 q += (rstr[n] - '0') * factor; 151 factor *= base; 152 } 153 if (n == 0) { 154 j = m + 1; 155 goto nextbrack; 156 } 157 (*set)[setranges].start = rstr[i + 1]; 158 (*set)[setranges].end = rstr[i + 1]; 159 (*set)[setranges].quant = q ? 
q : setlen(set1, MAX(set1ranges, 1)); 160 setranges++; 161 i = j; 162 continue; 163 } 164 165 j = m + 1; 166 goto nextbrack; 167 } 168 literal: 169 /* RANGES [_-__-_], _-__-_ */ 170 /* LITERALS _______ */ 171 (*set)[setranges].start = rstr[i]; 172 173 if (i < len - 2 && rstr[i + 1] == '-' && rstr[i + 2] >= rstr[i]) 174 i += 2; 175 (*set)[setranges].end = rstr[i]; 176 (*set)[setranges].quant = 1; 177 setranges++; 178 } 179 180 free(rstr); 181 return setranges; 182 } 183 184 static void 185 usage(void) 186 { 187 eprintf("usage: %s [-cCds] set1 [set2]\n", argv0); 188 } 189 190 int 191 main(int argc, char *argv[]) 192 { 193 Rune r, lastrune = 0; 194 size_t off1, off2, i, m; 195 int ret = 0; 196 197 ARGBEGIN { 198 case 'c': 199 case 'C': 200 cflag = 1; 201 break; 202 case 'd': 203 dflag = 1; 204 break; 205 case 's': 206 sflag = 1; 207 break; 208 default: 209 usage(); 210 } ARGEND 211 212 if (!argc || argc > 2 || (dflag == sflag && argc != 2) || 213 (dflag && argc != 1)) 214 usage(); 215 216 set1ranges = makeset(argv[0], &set1, &set1checks); 217 if (argc == 2) { 218 set2ranges = makeset(argv[1], &set2, &set2checks); 219 /* sanity checks as we are translating */ 220 if (!set2ranges && !set2checks) 221 eprintf("cannot map to an empty set.\n"); 222 if (set2checks && set2checks != ISLOWERBIT && 223 set2checks != ISUPPERBIT) { 224 eprintf("can only map to 'lower' and 'upper' class.\n"); 225 } 226 } 227 read: 228 if (!efgetrune(&r, stdin, "<stdin>")) { 229 ret |= fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>"); 230 return ret; 231 } 232 if (argc == 1 && sflag) 233 goto write; 234 for (i = 0, off1 = 0; i < set1ranges; off1 += rangelen(set1[i]), i++) { 235 if (set1[i].start <= r && r <= set1[i].end) { 236 if (dflag) { 237 if (cflag) 238 goto write; 239 else 240 goto read; 241 } 242 if (cflag) 243 goto write; 244 245 /* map r to set2 */ 246 if (set2checks) { 247 if (set2checks == ISLOWERBIT) 248 r = tolowerrune(r); 249 else 250 r = toupperrune(r); 251 } else { 252 off1 += r - 
set1[i].start; 253 if (off1 > setlen(set2, set2ranges) - 1) { 254 r = set2[set2ranges - 1].end; 255 goto write; 256 } 257 for (m = 0, off2 = 0; m < set2ranges; m++) { 258 if (off2 + rangelen(set2[m]) > off1) { 259 m++; 260 break; 261 } 262 off2 += rangelen(set2[m]); 263 } 264 m--; 265 r = set2[m].start + (off1 - off2) / set2[m].quant; 266 } 267 goto write; 268 } 269 } 270 if (check(r, set1checks)) { 271 if (cflag) 272 goto write; 273 if (dflag) 274 goto read; 275 if (set2checks) { 276 if (set2checks == ISLOWERBIT) 277 r = tolowerrune(r); 278 else 279 r = toupperrune(r); 280 } else { 281 r = set2[set2ranges - 1].end; 282 } 283 goto write; 284 } 285 if (!dflag && cflag) { 286 if (set2checks) { 287 if (set2checks == ISLOWERBIT) 288 r = tolowerrune(r); 289 else 290 r = toupperrune(r); 291 } else { 292 r = set2[set2ranges - 1].end; 293 } 294 goto write; 295 } 296 if (dflag && cflag) 297 goto read; 298 write: 299 if (argc == 1 && sflag && r == lastrune) { 300 if (check(r, set1checks)) 301 goto read; 302 for (i = 0; i < set1ranges; i++) { 303 if (set1[i].start <= r && r <= set1[i].end) 304 goto read; 305 } 306 } 307 if (argc == 2 && sflag && r == lastrune) { 308 if (set2checks && check(r, set2checks)) 309 goto read; 310 for (i = 0; i < set2ranges; i++) { 311 if (set2[i].start <= r && r <= set2[i].end) 312 goto read; 313 } 314 } 315 efputrune(&r, stdout, "<stdout>"); 316 lastrune = r; 317 goto read; 318 }