1 /** 2 Command line tool using fields in a tab-separated value file to identify equivalent 3 lines. Can either remove the duplicate entries or mark as equivalence classes. 4 5 This tool reads a tab-separated value file line by line, using one or more fields to 6 record a key. If the same key is found in a subsequent line, it is identified as 7 equivalent. When operating in 'uniq' mode, the first time a key is seen the line is 8 written to standard output, but subsequent matching lines are discarded. 9 10 The alternate to 'uniq' is 'equiv-class' identification. In this mode, all lines 11 written to standard output, but a new field is added marking equivalent entries with 12 with an ID. The ID is simply a one-upped counter. 13 14 Copyright (c) 2015-2018, eBay Software Foundation 15 Initially written by Jon Degenhardt 16 17 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) 18 */ 19 module tsv_uniq; 20 21 import std.stdio; 22 import std.format : format; 23 import std.typecons : tuple; 24 25 auto helpText = q"EOS 26 Synopsis: tsv-uniq [options] [file...] 27 28 tsv-uniq filters out duplicate lines using fields as a key. Filtering is based 29 on the entire line if a key is not provided. 30 31 Options: 32 EOS"; 33 34 auto helpTextVerbose = q"EOS 35 Synopsis: tsv-uniq [options] [file...] 36 37 tsv-uniq identifies equivalent lines in tab-separated value files. Input is read 38 line by line, recording a key based on one or more of the fields. Two lines are 39 equivalent if they have the same key. When operating in 'uniq' mode, the first 40 time a key is seen the line is written to standard output, but subsequent lines 41 are discarded. This is similar to the unix 'uniq' program, but based on individual 42 fields and without requiring sorted data. This command uniq's on fields 2 and 3: 43 44 tsv-uniq -f 2,3 file.tsv 45 46 The alternate to 'uniq' mode is 'equiv-class' identification. In this mode, all 47 lines are written to standard output, but with a new field added marking 48 equivalent entries with an ID. The ID is simply a one-upped counter. Example: 49 50 tsv-uniq --header -f 2,3 --equiv file.tsv 51 52 tsv-uniq can be run without specifying a key field. In this case the whole line 53 is used as a key, same as the Unix 'uniq' program. This works on any line-oriented 54 text file, not just TSV files. 55 56 The '--r|repeated' option can be used to print only lines occurring more than 57 once. '--a|at-least N' is similar, except that it only prints lines occuring at 58 least N times. For both, the Nth line found is printed, in the order found. 59 60 The '--m|max MAX' option changes the behavior to output the first MAX lines for 61 each key, rather than just the first line for each key. This can also with used 62 with '--e|equiv' to limit the number output for each equivalence class. 63 64 It's not obvious when both '--a|at-least' and '--m|max' might be useful, but, if 65 both are specified, the occurrences between 'at-least' and 'max' are output. 66 67 Options: 68 EOS"; 69 70 /** 71 Container for command line options. 72 */ 73 struct TsvUniqOptions 74 { 75 enum defaultEquivHeader = "equiv_id"; 76 enum defaultEquivStartID = 1; 77 78 string programName; 79 bool helpVerbose = false; // --h|help-verbose 80 bool versionWanted = false; // --V|version 81 size_t[] fields; // --f|fields 82 bool hasHeader = false; // --H|header 83 bool onlyRepeated = false; // --r|repeated. Shorthand for '--atleast 2' 84 size_t atLeast = 0; // --a|at-least. Zero implies default behavior. 85 size_t max = 0; // --m|max. Zero implies default behavior. 86 bool equivMode = false; // --e|equiv 87 string equivHeader = defaultEquivHeader; // --equiv-header 88 long equivStartID = defaultEquivStartID; // --equiv-start 89 bool ignoreCase = false; // --i|ignore-case 90 char delim = '\t'; // --d|delimiter 91 bool keyIsFullLine = false; // Derived. True if no fields specified or '--f|fields 0' 92 93 /* Returns a tuple. First value is true if command line arguments were successfully 94 * processed and execution should continue, or false if an error occurred or the user 95 * asked for help. If false, the second value is the appropriate exit code (0 or 1). 96 * 97 * Returning true (execution continues) means args have been validated and derived 98 * values calculated. In addition, field indices have been converted to zero-based. 99 * If the whole line is the key, the individual fields list will be cleared. 100 * 101 * Repeat count control variables 'atLeast' and max' - The general idea is that the 102 * values are left at zero if no repeat count options are specified. They are set if 103 * repeat count options are specified, as follows: 104 * * atLeast - Will be zero unless --r|repeated or --a|at-least is specified. 105 * --r|repeated option sets it 2, --a|at-least sets it to the specified value. 106 * * max - Default to zero. Is set to the --m|max value if provided. Is set to 107 * 'atLeast' if --r|repeated or --a|at-least is provided. 108 * 109 * An exception to the above: If --e|equiv-mode is specified, then (max == 0) 110 * represents the default "output all values" case. In this case max may be less 111 * than the at-least value. 112 */ 113 auto processArgs (ref string[] cmdArgs) 114 { 115 import std.algorithm : any, each; 116 import std.getopt; 117 import std.path : baseName, stripExtension; 118 import std.typecons : Yes, No; 119 import tsvutil : makeFieldListOptionHandler; 120 121 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 122 123 try 124 { 125 arraySep = ","; // Use comma to separate values in command line options 126 auto r = getopt( 127 cmdArgs, 128 "help-verbose", " Print full help.", &helpVerbose, 129 std.getopt.config.caseSensitive, 130 "V|version", " Print version information and exit.", &versionWanted, 131 "H|header", " Treat the first line of each file as a header.", &hasHeader, 132 std.getopt.config.caseInsensitive, 133 134 "f|fields", "<field-list> Fields to use as the key. Default: 0 (entire line).", 135 fields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex,Yes.allowFieldNumZero), 136 137 "i|ignore-case", " Ignore case when comparing keys.", &ignoreCase, 138 "r|repeated", " Output only lines that are repeated (based on the key).", &onlyRepeated, 139 "a|at-least", "INT Output only lines that are repeated INT times (based on the key). Zero and one are ignored.", &atLeast, 140 "m|max", "INT Max number of each unique key to output (zero is ignored).", &max, 141 "e|equiv", " Output equiv class IDs rather than uniq'ing entries.", &equivMode, 142 "equiv-header", "STR Use STR as the equiv-id field header. Applies when using '--header --equiv'. Default: 'equiv_id'.", &equivHeader, 143 "equiv-start", "INT Use INT as the first equiv-id. Default: 1.", &equivStartID, 144 "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim, 145 ); 146 147 if (r.helpWanted) 148 { 149 defaultGetoptPrinter(helpText, r.options); 150 return tuple(false, 0); 151 } 152 else if (helpVerbose) 153 { 154 defaultGetoptPrinter(helpTextVerbose, r.options); 155 return tuple(false, 0); 156 } 157 else if (versionWanted) 158 { 159 import tsvutils_version; 160 writeln(tsvutilsVersionNotice("tsv-uniq")); 161 return tuple(false, 0); 162 } 163 164 /* Consistency checks */ 165 if (!equivMode) 166 { 167 if (equivHeader != defaultEquivHeader) 168 { 169 throw new Exception("--equiv-header requires --e|equiv"); 170 } 171 else if (equivStartID != defaultEquivStartID) 172 { 173 throw new Exception("--equiv-start requires --e|equiv"); 174 } 175 } 176 177 if (fields.length > 1 && fields.any!(x => x == 0)) 178 { 179 throw new Exception("Whole line as key (--f|field 0) cannot be combined with multiple fields."); 180 } 181 182 /* Derivations */ 183 if (fields.length == 0) 184 { 185 keyIsFullLine = true; 186 } 187 else if (fields.length == 1 && fields[0] == 0) 188 { 189 keyIsFullLine = true; 190 fields.length = 0; 191 } 192 193 if (onlyRepeated && atLeast <= 1) atLeast = 2; 194 if (atLeast >= 2 && max < atLeast) 195 { 196 // Don't modify max if it is zero and equivMode is in effect. 197 if (max != 0 || !equivMode) max = atLeast; 198 } 199 200 fields.each!((ref x) => --x); // Convert to 1-based indexing. 201 202 } 203 catch (Exception exc) 204 { 205 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 206 return tuple(false, 1); 207 } 208 return tuple(true, 0); 209 } 210 } 211 212 int main(string[] cmdArgs) 213 { 214 /* When running in DMD code coverage mode, turn on report merging. */ 215 version(D_Coverage) version(DigitalMars) 216 { 217 import core.runtime : dmd_coverSetMerge; 218 dmd_coverSetMerge(true); 219 } 220 221 TsvUniqOptions cmdopt; 222 auto r = cmdopt.processArgs(cmdArgs); 223 if (!r[0]) return r[1]; 224 try tsvUniq(cmdopt, cmdArgs[1..$]); 225 catch (Exception exc) 226 { 227 stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 228 return 1; 229 } 230 return 0; 231 } 232 233 void tsvUniq(in TsvUniqOptions cmdopt, in string[] inputFiles) 234 { 235 import tsvutil : InputFieldReordering, BufferedOutputRange; 236 import std.algorithm : splitter; 237 import std.array : join; 238 import std.conv : to; 239 import std.range; 240 import std.uni : toLower; 241 242 /* InputFieldReordering maps the key fields from an input line to a separate buffer. */ 243 auto keyFieldsReordering = new InputFieldReordering!char(cmdopt.fields); 244 245 /* BufferedOutputRange is a performance enhancement for writing to stdout. */ 246 auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); 247 248 /* The master hash. The key is the specified fields concatenated together (including 249 * separators). The value is a struct with the equiv-id and occurrence count. 250 */ 251 struct EquivEntry { size_t equivID; size_t count; } 252 EquivEntry[string] equivHash; 253 254 size_t numFields = cmdopt.fields.length; 255 long nextEquivID = cmdopt.equivStartID; 256 bool headerWritten = false; 257 foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"]) 258 { 259 auto inputStream = (filename == "-") ? stdin : filename.File(); 260 foreach (lineNum, line; inputStream.byLine.enumerate(1)) 261 { 262 if (cmdopt.hasHeader && lineNum == 1) 263 { 264 /* Header line. */ 265 if (!headerWritten) 266 { 267 bufferedOutput.append(line); 268 if (cmdopt.equivMode) 269 { 270 bufferedOutput.append(cmdopt.delim); 271 bufferedOutput.append(cmdopt.equivHeader); 272 } 273 bufferedOutput.appendln(); 274 headerWritten = true; 275 } 276 } 277 else 278 { 279 /* Regular line (not header). Start by finding the key. */ 280 typeof(line) key; 281 if (cmdopt.keyIsFullLine) 282 { 283 key = line; 284 } 285 else 286 { 287 /* Copy the key fields to a new buffer. */ 288 keyFieldsReordering.initNewLine; 289 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 290 { 291 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 292 if (keyFieldsReordering.allFieldsFilled) break; 293 } 294 295 if (!keyFieldsReordering.allFieldsFilled) 296 { 297 throw new Exception( 298 format("Not enough fields in line. File: %s, Line: %s", 299 (filename == "-") ? "Standard Input" : filename, lineNum)); 300 } 301 302 key = keyFieldsReordering.outputFields.join(cmdopt.delim); 303 } 304 305 if (cmdopt.ignoreCase) key = key.toLower; 306 307 bool isOutput = false; 308 EquivEntry currEntry; 309 EquivEntry* priorEntry = (key in equivHash); 310 if (priorEntry is null) 311 { 312 isOutput = (cmdopt.atLeast <= 1); 313 currEntry.equivID = nextEquivID; 314 currEntry.count = 1; 315 equivHash[key.to!string] = currEntry; 316 nextEquivID++; 317 } 318 else 319 { 320 (*priorEntry).count++; 321 currEntry = *priorEntry; 322 323 if ((currEntry.count <= cmdopt.max && currEntry.count >= cmdopt.atLeast) || 324 (cmdopt.equivMode && cmdopt.max == 0)) 325 { 326 isOutput = true; 327 } 328 } 329 330 if (isOutput) 331 { 332 bufferedOutput.append(line); 333 if (cmdopt.equivMode) 334 { 335 bufferedOutput.append(cmdopt.delim); 336 bufferedOutput.append(currEntry.equivID.to!string); 337 } 338 bufferedOutput.appendln(); 339 } 340 } 341 } 342 } 343 }