将2个大型CSV文件合并为一个文件

I have an archive file that is about 50GB in size.

Each week, I have to take a CSV file and merge it with the very large 50GB CSV file.

I'm new to Go, and was hoping for a nice, elegant solution in Go.

The files look like:

"a:123", 101010
"b:123", 101010
"some-key-here:123", 101010
"some-key-here:234", 101010

While I have not compiled it myself to check, this ought to do what you want, once you implement the compare() function. It is essentially the "merge" step of the Mergesort algorithm. Since you've already got the two files in sorted order, you only need the merge step, which can be done in streaming fashion.

package main

import (
    "encoding/csv"
    "io"
    "log"
    "os"
)

// outFile is the path of the file that main creates and writes the merged
// CSV records to. The value is a placeholder; set it to a real path.
const outFile = "your/output/file/path.ext"

func main() {
    // make sure there are only 2 args
    if len(os.Args) != 3 {
        log.Panic("
Usage: command file1 file2")
    }

    // open the first file
    f1, e := os.Open(os.Args[1])
    if e != nil {
        log.Panic("
Unable to open first file: ", e)
    }
    defer f1.Close()

    // open second file
    f2, e := os.Open(os.Args[2])
    if e != nil {
        log.Panic("
Unable to open second file: ", e)
    }
    defer f2.Close()

    // create a file writer
    w, e := os.Create(outFile)
    if e != nil {
        log.Panic("
Unable to create new file: ", e)
    }
    defer w.Close()

    // wrap the file readers with CSV readers
    cr1 := csv.NewReader(f1)
    cr2 := csv.NewReader(f2)

    // wrap the out file writer with a CSV writer
    cw := csv.NewWriter(w)

    // initialize the lines
    line1, b := readline(cr1)
    if !b {
        log.Panic("
No CSV lines in file 1.")
    }
    line2, b := readline(cr2)
    if !b {
        log.Panic("
No CSV lines in file 2.")
    }

    // copy the files according to similar rules of the merge step in Mergesort
    for {
        if compare(line1, line2) {
            writeline(line1)
            if line1, b = readline(cr1); !b {
                copy(cr2, w)
                break
            }
        } else {
            writeline(line2)
            if line2, b = readline(cr2); !b {
                copy(cr1, w)
                break
            }
        }
    }

    // note the files will be closed here, since we defered it above
}

// readline reads the next record from r.
//
// It returns the record and true on success, or nil and false once r has
// reached io.EOF. Any other read error aborts the program via log.Panic.
// r is a pointer because csv.NewReader returns *csv.Reader and the reader
// carries state that must not be copied between calls.
func readline(r *csv.Reader) ([]string, bool) {
    line, e := r.Read()
    if e == io.EOF {
        return nil, false
    }
    if e != nil {
        log.Panic("\nError reading file: ", e)
    }
    return line, true
}

// writeline writes one record to w and aborts the program via log.Panic if
// the write fails.
//
// csv.Writer buffers its output, so the caller must call w.Flush once all
// records have been written. w is a pointer because csv.NewWriter returns
// *csv.Writer.
func writeline(w *csv.Writer, line []string) {
    if e := w.Write(line); e != nil {
        log.Panic("\nError writing file: ", e)
    }
}

// copy writes every remaining record of r to w, stopping when readline
// reports that r is exhausted.
//
// Note that this package-level function shadows Go's built-in copy within
// the package.
func copy(r *csv.Reader, w *csv.Writer) {
    // Loop while readline succeeds, and advance line (not r) on each pass.
    for line, ok := readline(r); ok; line, ok = readline(r) {
        writeline(w, line)
    }
}

// compare reports whether line1 should be written before line2.
//
// NOTE(review): this default assumes both files are sorted by a byte-wise
// string comparison of their first column (the key); adjust it if the files
// use a different sort order. Ties return true, so records from the first
// file are written before equal-keyed records from the second.
func compare(line1, line2 []string) bool {
    // Guard against records with no fields so the index below cannot panic.
    if len(line1) == 0 {
        return true
    }
    if len(line2) == 0 {
        return false
    }
    return line1[0] <= line2[0]
}

Note: This answer has been heavily edited to include the code inline instead of a link. Additionally, the code has been improved dramatically since my first draft, but since there's been no activity here, I'm just blowing away the old version and rewriting my answer.

If the two files are each already sorted, then you can use the merge function of merge sort to combine them into a single sorted array.

By array, I mean we can use another CSV file to write the sorted data on the fly.