I need to write a Go script that opens a big CSV file and creates new, separate CSVs based on the value of the first field of each line.
The CSV file looks like this:
"country", "otherfield", "otherfield1", "otherfield2", "etc"
"AT", "otherfield", "otherfield1", "otherfield2", "etc"
"AT", "otherfield", "otherfield1", "otherfield2", "etc"
"DE", "otherfield", "otherfield1", "otherfield2", "etc"
"DE", "otherfield", "otherfield1", "otherfield2", "etc"
So what I am trying to do is create a file named after the first field's value (e.g. AT.csv), containing all the lines that start with that value.
The following is the script that I have written so far:
package main

import (
    "encoding/csv"
    "fmt"
    "os"
)

func main() {
    // contentCreated := make(chan map[string]string)
    createContent("union_exp.csv")
}

func createContent(csvfilename string) {
    keys := ""
    content := make(map[string]string)

    csvfile, err := os.Open(csvfilename)
    if err != nil {
        fmt.Println(err)
    }
    defer csvfile.Close()

    reader := csv.NewReader(csvfile)
    reader.FieldsPerRecord = -1

    rawCSVdata, err := reader.ReadAll()
    if err != nil {
        fmt.Println(err)
        os.Exit(1)
    }

    for i, each := range rawCSVdata {
        if i == 0 {
            keys = "\"" + each[0] + "\",\"" + each[1] + "\",\"" + each[2] + "\",\"" + each[3] + "\",\"" + each[4] + "\"\n"
        } else {
            stringtoadd := "\"" + each[0] + "\",\"" + each[1] + "\",\"" + each[2] + "\",\"" + each[3] + "\",\"" + each[4] + "\"\n"
            if i%10000 == 0 {
                fmt.Println(i)
            }
            exists := Exists(content, each[0])
            if !exists {
                content[each[0]] = keys
            }
            content[each[0]] += stringtoadd
            createFile(each[0], content[each[0]])
        }
    }
}

func createFile(name, content string) {
    f, _ := os.Create(name + ".csv")
    f.WriteString(content)
    f.Close()
}

func Exists(content map[string]string, name string) bool {
    _, exists := content[name]
    return exists
}
The problem I am having at the moment is that performance is quite slow. I even have a similar script written in PHP that executes the same operation much faster, which obviously makes me think something must be wrong with my Go script.
Can someone help me understand what is wrong with it?
Thank you!
You are (unnecessarily) loading the complete CSV file into memory at once, and you are rewriting each output file every time its contents change.
Try the following:
package main

import (
    "encoding/csv"
    "fmt"
    "io"
    "os"
    "sync"
)

func main() {
    input, err := os.Open("union_exp.csv")
    if err != nil {
        fmt.Println("Error while opening CSV file.")
        return
    }
    defer input.Close()

    reader := csv.NewReader(input)
    reader.FieldsPerRecord = -1

    // One channel (and one writer goroutine) per output file.
    files := make(map[string]chan []string)

    keys, err := reader.Read() // header row, repeated in every output file
    if err != nil {
        fmt.Println("Error while reading CSV file.")
        return
    }

    wg := &sync.WaitGroup{}

    var line []string
    for line, err = reader.Read(); err == nil; line, err = reader.Read() {
        ch, ok := files[line[0]]
        if !ok {
            // First time we see this key: start a writer and send the header.
            ch = make(chan []string, 8)
            wg.Add(1)
            go fileWriter(line[0], ch, wg)
            ch <- keys
            files[line[0]] = ch
        }
        ch <- line
    }
    if err != io.EOF {
        fmt.Println("Error while reading CSV file.")
        return
    }

    // Closing the channels lets the writer goroutines drain and exit.
    for _, ch := range files {
        close(ch)
    }
    wg.Wait()

    fmt.Println("Done!")
}

func fileWriter(fileName string, ch chan []string, wg *sync.WaitGroup) {
    defer wg.Done()

    // "x" prefix keeps these files from clobbering any output you already have.
    file, err := os.Create("x" + fileName + ".csv")
    if err != nil {
        fmt.Println("Error while creating output file.")
        os.Exit(1) // Kill the whole app
    }
    defer file.Close()

    writer := csv.NewWriter(file)
    defer writer.Flush()

    for line := range ch {
        writer.Write(line)
    }
}
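If you would rather avoid the goroutines, the same fix works sequentially as well. Here is a rough, untested sketch of that variant (my own illustration, not code from your question): stream the rows once and keep one open csv.Writer per key, so no file is ever rewritten.

package main

import (
    "encoding/csv"
    "fmt"
    "io"
    "os"
)

func main() {
    input, err := os.Open("union_exp.csv")
    if err != nil {
        fmt.Println("Error while opening CSV file.")
        return
    }
    defer input.Close()

    reader := csv.NewReader(input)
    reader.FieldsPerRecord = -1

    keys, err := reader.Read() // header row, repeated in every output file
    if err != nil {
        fmt.Println("Error while reading CSV file.")
        return
    }

    writers := make(map[string]*csv.Writer)
    for {
        line, err := reader.Read()
        if err == io.EOF {
            break
        }
        if err != nil {
            fmt.Println("Error while reading CSV file.")
            return
        }
        w, ok := writers[line[0]]
        if !ok {
            f, err := os.Create(line[0] + ".csv")
            if err != nil {
                fmt.Println("Error while creating output file.")
                return
            }
            defer f.Close() // fine here: main only returns after the loop
            w = csv.NewWriter(f)
            w.Write(keys)
            writers[line[0]] = w
        }
        w.Write(line)
    }

    // Flush the buffered writers; the deferred Closes run afterwards.
    for _, w := range writers {
        w.Flush()
    }

    fmt.Println("Done!")
}

Either way, each output file stays open for the whole run, which is no problem for a couple of hundred country codes.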
I second @plusmid's answer - your program spends the vast majority of its time opening, (over)writing, and closing files.
So, first of all, fix this bug and write the content only once per key:
package main

import (
    "encoding/csv"
    "fmt"
    "os"
)

func main() {
    // contentCreated := make(chan map[string]string)
    createContent("union_exp.csv")
}

func createContent(csvfilename string) {
    keys := ""
    content := make(map[string]string)

    csvfile, err := os.Open(csvfilename)
    if err != nil {
        fmt.Println(err)
    }
    defer csvfile.Close()

    reader := csv.NewReader(csvfile)
    reader.FieldsPerRecord = -1

    rawCSVdata, err := reader.ReadAll()
    if err != nil {
        fmt.Println(err)
        os.Exit(1)
    }

    for i, each := range rawCSVdata {
        if i == 0 {
            keys = "\"" + each[0] + "\",\"" + each[1] + "\",\"" + each[2] + "\",\"" + each[3] + "\",\"" + each[4] + "\"\n"
        } else {
            stringtoadd := "\"" + each[0] + "\",\"" + each[1] + "\",\"" + each[2] + "\",\"" + each[3] + "\",\"" + each[4] + "\"\n"
            if i%10000 == 0 {
                fmt.Println(i)
            }
            exists := Exists(content, each[0])
            if !exists {
                content[each[0]] = keys
            }
            content[each[0]] += stringtoadd
        }
    }

    // Write each file exactly once, after all rows have been grouped.
    for key, content := range content {
        createFile(key, content)
    }
}

func createFile(name, content string) {
    f, _ := os.Create(name + ".csv")
    f.WriteString(content)
    f.Close()
}

func Exists(content map[string]string, name string) bool {
    _, exists := content[name]
    return exists
}
On a 25k-line CSV this takes the run time from 50 seconds down to 5.
Next, think about using goroutines to parse the file in parallel; right now you are using only a single core. There are other issues as well, such as building each key's content with repeated + concatenation, which copies the whole string on every append - accumulating into a bytes.Buffer (or strings.Builder) is generally much faster for this. You still have a lot of room to optimize this code.
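For example, here is a rough, untested sketch of that buffering idea (same input file name and overall structure as your code; it relies on encoding/csv's writer instead of hand-built quoting, so the quoting of simple fields may differ slightly):

package main

import (
    "bytes"
    "encoding/csv"
    "fmt"
    "os"
)

func main() {
    createContent("union_exp.csv")
}

func createContent(csvfilename string) {
    csvfile, err := os.Open(csvfilename)
    if err != nil {
        fmt.Println(err)
        return
    }
    defer csvfile.Close()

    reader := csv.NewReader(csvfile)
    reader.FieldsPerRecord = -1

    rawCSVdata, err := reader.ReadAll()
    if err != nil {
        fmt.Println(err)
        os.Exit(1)
    }

    // One buffer and one csv.Writer per key; appending to a buffer is
    // amortized O(1) instead of copying the whole string on every append.
    buffers := make(map[string]*bytes.Buffer)
    writers := make(map[string]*csv.Writer)

    var keys []string
    for i, each := range rawCSVdata {
        if i == 0 {
            keys = each // header row, repeated in every output file
            continue
        }
        w, ok := writers[each[0]]
        if !ok {
            buf := &bytes.Buffer{}
            w = csv.NewWriter(buf)
            w.Write(keys)
            buffers[each[0]] = buf
            writers[each[0]] = w
        }
        w.Write(each)
    }

    // Write each file exactly once.
    for key, w := range writers {
        w.Flush()
        f, err := os.Create(key + ".csv")
        if err != nil {
            fmt.Println(err)
            continue
        }
        f.Write(buffers[key].Bytes())
        f.Close()
    }
}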