1 /** 2 Command line tool that identifies equivalent lines in an input stream. Equivalent 3 lines are identified using either the full line or a set of fields as the key. By 4 default, input is written to standard output, retaining only the first occurrence of 5 equivalent lines. There are also options for marking and numbering equivalent lines 6 rather, without filtering out duplicates. 7 8 This tool is similar in spirit to the Unix 'uniq' tool, with some key differences. 9 First, the key can be composed of individual fields, not just the full line. Second, 10 input does not need to be sorted. (Unix 'uniq' only detects equivalent lines when 11 they are adjacent, hence the usual need for sorting.) 12 13 There are a couple alternative to uniq'ing the input lines. One is to mark lines with 14 an equivalence ID, which is a one-upped counter. The other is to number lines, with 15 each unique key have its own set of numbers. 16 17 Copyright (c) 2015-2020, eBay Inc. 18 Initially written by Jon Degenhardt 19 20 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) 21 */ 22 module tsv_utils.tsv_uniq; 23 24 import std.exception : enforce; 25 import std.format : format; 26 import std.range; 27 import std.stdio; 28 import std.typecons : tuple; 29 30 auto helpText = q"EOS 31 Synopsis: tsv-uniq [options] [file...] 32 33 tsv-uniq filters out duplicate lines using fields as a key. Filtering is 34 based on the entire line when key fields are not provided. Options are 35 also available for assigning a unique id to each key and numbering the 36 occurrences of each key. Use '--help-verbose' for more details. 37 38 Options: 39 EOS"; 40 41 auto helpTextVerbose = q"EOS 42 Synopsis: tsv-uniq [options] [file...] 43 44 tsv-uniq identifies equivalent lines in tab-separated value files. Input 45 is read line by line, recording a key for each line based on one or more 46 of the fields. Two lines are equivalent if they have the same key. The 47 first time a key is seen its line is written to standard output. 48 Subsequent lines containing the same key are discarded. This command 49 uniq's a file on fields 2 and 3: 50 51 tsv-uniq -f 2,3 file.tsv 52 53 This is similar to the Unix 'uniq' program, but based on individual 54 fields and without requiring sorted data. 55 56 tsv-uniq can be run without specifying a key field. In this case the 57 whole line is used as a key, same as the Unix 'uniq' program. This works 58 on any line-oriented text file, not just TSV files. 59 60 The above is the default behavior ('uniq' mode). The alternates to 'uniq' 61 mode are 'number' mode and 'equiv-class' mode. In 'equiv-class' mode, all 62 lines are written to standard output, but with a field appended marking 63 equivalent entries with an ID. The ID is a one-upped counter. Example: 64 65 tsv-uniq --header -f 2,3 --equiv file.tsv 66 67 'Number' mode also writes all lines to standard output, but with a field 68 appended numbering the occurrence count for the line's key. The first line 69 with a specific key is assigned the number '1', the second with the key is 70 assigned number '2', etc. 'Number' and 'equiv-class' modes can be combined. 71 72 The '--r|repeated' option can be used to print only lines occurring more 73 than once. Specifically, the second occurrence of a key is printed. The 74 '--a|at-least N' option is similar, printing lines occurring at least N 75 times. (Like repeated, the Nth line with the key is printed.) 76 77 The '--m|max MAX' option changes the behavior to output the first MAX 78 lines for each key, rather than just the first line for each key. 79 80 If both '--a|at-least' and '--m|max' are specified, the occurrences 81 starting with 'at-least' and ending with 'max' are output. 82 83 Options: 84 EOS"; 85 86 /** Container for command line options. 87 */ 88 struct TsvUniqOptions 89 { 90 import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader; 91 92 enum defaultEquivHeader = "equiv_id"; 93 enum defaultEquivStartID = 1; 94 enum defaultNumberHeader = "equiv_line"; 95 96 string programName; 97 InputSourceRange inputSources; // Input files 98 bool helpVerbose = false; // --h|help-verbose 99 bool versionWanted = false; // --V|version 100 size_t[] fields; // --f|fields 101 bool hasHeader = false; // --H|header 102 bool onlyRepeated = false; // --r|repeated. Shorthand for '--atleast 2' 103 size_t atLeast = 0; // --a|at-least. Zero implies default behavior. 104 size_t max = 0; // --m|max. Zero implies default behavior. 105 bool numberMode = false; // --z|number 106 string numberHeader = defaultNumberHeader; // --number-header 107 bool equivMode = false; // --e|equiv 108 string equivHeader = defaultEquivHeader; // --equiv-header 109 long equivStartID = defaultEquivStartID; // --equiv-start 110 bool ignoreCase = false; // --i|ignore-case 111 char delim = '\t'; // --d|delimiter 112 bool keyIsFullLine = false; // Derived. True if no fields specified or '--f|fields 0' 113 114 /* Returns a tuple. First value is true if command line arguments were successfully 115 * processed and execution should continue, or false if an error occurred or the user 116 * asked for help. If false, the second value is the appropriate exit code (0 or 1). 117 * 118 * Returning true (execution continues) means args have been validated and derived 119 * values calculated. In addition, field indices have been converted to zero-based. 120 * If the whole line is the key, the individual fields list will be cleared. 121 * 122 * Repeat count control variables 'atLeast' and max' - These values are left at zero 123 * if no repeat count options are specified. They are set if repeat count options 124 * are specified, as follows: 125 * * atLeast - Will be zero unless --r|repeated or --a|at-least is specified. 126 * --r|repeated option sets it 2, --a|at-least sets it to the specified value. 127 * * max - Default to zero. Is set to the --m|max value if provided. Is set to 128 * 'atLeast' if --r|repeated or --a|at-least is provided. 129 * 130 * An exception to the above: If --e|equiv-mode is specified, then (max == 0) 131 * represents the default "output all values" case. In this case max may be less 132 * than the at-least value. 133 */ 134 auto processArgs (ref string[] cmdArgs) 135 { 136 import std.algorithm : all, each; 137 import std.getopt; 138 import std.path : baseName, stripExtension; 139 import std.typecons : Yes, No; 140 import tsv_utils.common.utils : makeFieldListOptionHandler; 141 142 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 143 144 try 145 { 146 arraySep = ","; // Use comma to separate values in command line options 147 auto r = getopt( 148 cmdArgs, 149 "help-verbose", " Print full help.", &helpVerbose, 150 std.getopt.config.caseSensitive, 151 "V|version", " Print version information and exit.", &versionWanted, 152 "H|header", " Treat the first line of each file as a header.", &hasHeader, 153 std.getopt.config.caseInsensitive, 154 155 "f|fields", "<field-list> Fields to use as the key. Default: 0 (entire line).", 156 fields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex,Yes.allowFieldNumZero), 157 158 "i|ignore-case", " Ignore case when comparing keys.", &ignoreCase, 159 "r|repeated", " Output only lines that are repeated (based on the key).", &onlyRepeated, 160 "a|at-least", "INT Output only lines that are repeated INT times (based on the key). Zero and one are ignored.", &atLeast, 161 "m|max", "INT Max number of each unique key to output (zero is ignored).", &max, 162 "e|equiv", " Output equivalence class IDs rather than uniq'ing entries.", &equivMode, 163 "equiv-header", "STR Use STR as the equiv-id field header (when using '-H --equiv'). Default: 'equiv_id'.", &equivHeader, 164 "equiv-start", "INT Use INT as the first equiv-id. Default: 1.", &equivStartID, 165 "z|number", " Output equivalence class occurrence counts rather than uniq'ing entries.", &numberMode, 166 "number-header", "STR Use STR as the '--number' field header (when using '-H --number)'. Default: 'equiv_line'.", &numberHeader, 167 "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim, 168 ); 169 170 if (r.helpWanted) 171 { 172 defaultGetoptPrinter(helpText, r.options); 173 return tuple(false, 0); 174 } 175 else if (helpVerbose) 176 { 177 defaultGetoptPrinter(helpTextVerbose, r.options); 178 return tuple(false, 0); 179 } 180 else if (versionWanted) 181 { 182 import tsv_utils.common.tsvutils_version; 183 writeln(tsvutilsVersionNotice("tsv-uniq")); 184 return tuple(false, 0); 185 } 186 187 /* Input files. Remaining command line args are files. */ 188 string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"]; 189 cmdArgs.length = 1; 190 ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader; 191 inputSources = inputSourceRange(filepaths, readHeader); 192 193 /* Consistency checks */ 194 if (!equivMode) 195 { 196 enforce(equivHeader == defaultEquivHeader, "--equiv-header requires --e|equiv"); 197 enforce(equivStartID == defaultEquivStartID, "--equiv-start requires --e|equiv"); 198 } 199 200 enforce(numberMode || numberHeader == defaultNumberHeader, 201 "--number-header requires --z|number"); 202 203 enforce(fields.length <= 1 || fields.all!(x => x != 0), 204 "Whole line as key (--f|field 0) cannot be combined with multiple fields."); 205 206 /* Derivations */ 207 if (fields.length == 0) 208 { 209 keyIsFullLine = true; 210 } 211 else if (fields.length == 1 && fields[0] == 0) 212 { 213 keyIsFullLine = true; 214 fields.length = 0; 215 } 216 217 if (onlyRepeated && atLeast <= 1) atLeast = 2; 218 if (atLeast >= 2 && max < atLeast) 219 { 220 // Don't modify max if it is zero and equivMode or numberMode is in effect. 221 if (max != 0 || (!equivMode && !numberMode)) max = atLeast; 222 } 223 224 if (!keyIsFullLine) fields.each!((ref x) => --x); // Convert to 0-based indexing. 225 226 } 227 catch (Exception exc) 228 { 229 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 230 return tuple(false, 1); 231 } 232 return tuple(true, 0); 233 } 234 } 235 236 static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; 237 238 /** Main program. Processes command line arguments and calls tsvUniq which implements 239 * the main processing logic. 240 */ 241 int main(string[] cmdArgs) 242 { 243 /* When running in DMD code coverage mode, turn on report merging. */ 244 version(D_Coverage) version(DigitalMars) 245 { 246 import core.runtime : dmd_coverSetMerge; 247 dmd_coverSetMerge(true); 248 } 249 250 TsvUniqOptions cmdopt; 251 auto r = cmdopt.processArgs(cmdArgs); 252 if (!r[0]) return r[1]; 253 254 version(LDC_Profile) 255 { 256 import ldc.profile : resetAll; 257 resetAll(); 258 } 259 260 try tsvUniq(cmdopt); 261 catch (Exception exc) 262 { 263 stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 264 return 1; 265 } 266 return 0; 267 } 268 269 /** Outputs the unique lines from all the input files. 270 * 271 * Processes the lines in each input file. All lines are added to an associated array. 272 * The first time a line is seen it is output. If key fields are being used these are 273 * used as the basis for the associative array entries rather than the full line. 274 */ 275 void tsvUniq(ref TsvUniqOptions cmdopt) 276 { 277 import tsv_utils.common.utils : bufferedByLine, BufferedOutputRange, 278 InputFieldReordering, InputSourceRange, joinAppend, throwIfWindowsNewlineOnUnix; 279 import std.algorithm : splitter; 280 import std.array : appender; 281 import std.conv : to; 282 import std.uni : asLowerCase; 283 import std.utf : byChar; 284 285 /* inputSources must be an InputSourceRange and include at least stdin. */ 286 assert(!cmdopt.inputSources.empty); 287 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 288 289 /* InputFieldReordering maps the key fields from an input line to a separate buffer. */ 290 auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.fields); 291 292 /* BufferedOutputRange is a performance enhancement for writing to stdout. */ 293 auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); 294 295 /* The master hash. The key is the specified fields concatenated together (including 296 * separators). The value is a struct with the equiv-id and occurrence count. 297 */ 298 static struct EquivEntry { size_t equivID; size_t count; } 299 EquivEntry[string] equivHash; 300 301 /* Reusable buffers for multi-field keys and case-insensitive keys. */ 302 auto multiFieldKeyBuffer = appender!(char[]); 303 auto lowerKeyBuffer = appender!(char[]); 304 305 const size_t numKeyFields = cmdopt.fields.length; 306 long nextEquivID = cmdopt.equivStartID; 307 308 /* First header is read during command line arg processing. */ 309 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 310 { 311 auto inputStream = cmdopt.inputSources.front; 312 throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 313 314 bufferedOutput.append(inputStream.header); 315 316 if (cmdopt.equivMode) 317 { 318 bufferedOutput.append(cmdopt.delim); 319 bufferedOutput.append(cmdopt.equivHeader); 320 } 321 322 if (cmdopt.numberMode) 323 { 324 bufferedOutput.append(cmdopt.delim); 325 bufferedOutput.append(cmdopt.numberHeader); 326 } 327 328 bufferedOutput.appendln(); 329 } 330 331 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 332 333 foreach (inputStream; cmdopt.inputSources) 334 { 335 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 336 337 foreach (lineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine)) 338 { 339 if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum); 340 341 /* Start by finding the key. */ 342 typeof(line) key; 343 if (cmdopt.keyIsFullLine) 344 { 345 key = line; 346 } 347 else 348 { 349 assert(keyFieldsReordering !is null); 350 351 /* Copy the key fields to a new buffer. */ 352 keyFieldsReordering.initNewLine; 353 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 354 { 355 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 356 if (keyFieldsReordering.allFieldsFilled) break; 357 } 358 359 enforce(keyFieldsReordering.allFieldsFilled, 360 format("Not enough fields in line. File: %s, Line: %s", 361 inputStream.name, lineNum)); 362 363 if (numKeyFields == 1) 364 { 365 key = keyFieldsReordering.outputFields[0]; 366 } 367 else 368 { 369 multiFieldKeyBuffer.clear(); 370 keyFieldsReordering.outputFields.joinAppend(multiFieldKeyBuffer, cmdopt.delim); 371 key = multiFieldKeyBuffer.data; 372 } 373 } 374 375 if (cmdopt.ignoreCase) 376 { 377 /* Equivalent to key = key.toLower, but without memory allocation. */ 378 lowerKeyBuffer.clear(); 379 lowerKeyBuffer.put(key.asLowerCase.byChar); 380 key = lowerKeyBuffer.data; 381 } 382 383 bool isOutput = false; 384 EquivEntry currEntry; 385 EquivEntry* priorEntry = (key in equivHash); 386 if (priorEntry is null) 387 { 388 isOutput = (cmdopt.atLeast <= 1); 389 currEntry.equivID = nextEquivID; 390 currEntry.count = 1; 391 equivHash[key.to!string] = currEntry; 392 nextEquivID++; 393 } 394 else 395 { 396 (*priorEntry).count++; 397 currEntry = *priorEntry; 398 399 if ((currEntry.count <= cmdopt.max && currEntry.count >= cmdopt.atLeast) || 400 (cmdopt.equivMode && cmdopt.max == 0) || 401 (cmdopt.numberMode && cmdopt.max == 0)) 402 { 403 isOutput = true; 404 } 405 } 406 407 if (isOutput) 408 { 409 bufferedOutput.append(line); 410 411 if (cmdopt.equivMode) 412 { 413 bufferedOutput.append(cmdopt.delim); 414 bufferedOutput.append(currEntry.equivID.to!string); 415 } 416 417 if (cmdopt.numberMode) 418 { 419 bufferedOutput.append(cmdopt.delim); 420 bufferedOutput.append(currEntry.count.to!string); 421 } 422 423 bufferedOutput.appendln(); 424 } 425 } 426 } 427 }