Recently I was tasked with a file cleansing exercise. The endeavor looked simple, but the solution was not, I needed to prepare a file for processing, but wanted to first proof concept my approach so I opted to utilizing this opportunity to learn PoweShell.
More about the task: I was supplied with a series of CSV files in which the data looked like
As you can see this is something I think we may call alternating double-quotes comma delimited file with embedded commas, I simply needed to remove the occasional quotes and replace the delimiter to pipe to be able to process the file preserving the original quotes. You may notice, I blogged in the previous post I was able to achieve the same using MS SSIS Script Transformation which basically uses some .Net code. To be fair, I did not come up with the algorithm, turned out my colleague Danny T. had a very extensive experience in textual data processing, so I was just the implementer.
The script turned out to work as expected, but alas there is a limitation to PowerShell’s get-content Cmdlet on how long data file it can digest. So the script may not be bale to process the entire input file. I guess the workaround is in using a revised version that utilizes the .Net stream class, hope somebody would share it with me someday.
1: # Author: Arthur Zubarev
2: # Notes: Removes " and replaces the delimiter to |
3: # In DOS batch usage:
4: # Powershell.exe "& 'D:\PSScripts\RemoveQuotes-ReplaceDelim.ps1'" -InputCSV ""D:\Input.csv"" -NonInteractive -WindowStyle hidden -ExecutionPolicy RemoteSigned
5:
6: param([string]$InputCSV = "")
7:
8: # Temp assignment!
9: $InputCSV = "D:\Input.csv"
10:
11: $process_yes_no = 0
12:
13: if ( $InputCSV -eq "" ) {
14: Write-Host "Input parameter is missing. Usage: .\RemoveQuotes-ReplaceDelim.ps1 -InputCSV <file path>" -ForegroundColor red -BackgroundColor yellow
15: Exit
16: }
17:
18: function GetScriptDirectory {
19: $Invocation = (Get-Variable MyInvocation -Scope 1).Value
20: Split-Path $Invocation.MyCommand.Path
21: }
22:
23: Write-Host "Processing" $InputCSV -foregroundcolor green -backgroundcolor black
24:
25: # make the "fixed" file up
26: $fixed_file = Join-Path(GetScriptDirectory) "fixed.csv";
27:
28: Write-Host "Saving fixed file to" $fixed_file -foregroundcolor yellow -backgroundcolor black
29:
30: # read the input csv as binary
31: $reader = [System.IO.File]::OpenText($InputCSV)
32: try {
33: for(;;) {
34: $to_repair_t = $reader.ReadLine()
35: if ($to_repair_t -eq $null) {
36: break
37: }
38: $to_repair = get-content -encoding byte $to_repair_t
39: $fixed = New-Object System.Collections.ArrayList
40: $byteEncoder = New-Object System.Text.ASCIIEncoding
41: $in_quotes = 0
42:
43: foreach ($c in $to_repair) {
44: # toggle in-quotes flag
45: if ($c -eq 34) {
46: $in_quotes = !$in_quotes;
47: $process_yes_no = 1
48: }
49:
50: # replace comma in quotes with ~
51: if ($c -eq 44 -and $in_quotes) {
52: foreach ($b in $byteEncoder.GetBytes("~")) {
53: [void]$fixed.Add($b);
54: }
55: continue;
56: }
57:
58: # pass through the bytes
59: [void]$fixed.Add($c);
60: }
61:
62: if ($process_yes_no -eq 1 ) {
63: # write-out to fixed csv
64: set-content -encoding byte -path $fixed_file -value $fixed
65:
66: # carry on with the commas replacements
67: (Get-Content $fixed_file) |
68: Foreach-Object {$_ -replace ",", "|"} |
69: Set-Content $fixed_file
70:
71: # next do ~ replacements
72: (Get-Content $fixed_file) |
73: Foreach-Object {$_ -replace "~", ","} |
74: Set-Content $fixed_file
75:
76: # last fix step " removals
77: (Get-Content $fixed_file) |
78: Foreach-Object {$_ -replace """", ""} |
79: Set-Content $fixed_file
80:
81: # Swap the files
82: Move-Item $fixed_file $InputCSV -Force -ErrorAction SilentlyContinue
83:
84: #$File_Name = split-path $InputCSV -Leaf # in case the file name needs to be captured
85: }
86: else {
87: Write-Host "No processing was necessary."
88: }
89: }}
90: finally {
91: $reader.Close()
92: }
93:
94: Write-Host "Processing complete." -BackgroundColor DarkGreen -ForegroundColor yellow