GoLang: get the string at line N of a byte slice

In a personal project I am implementing a function that returns a random line from a long file. For it to work I have to create a function that returns the string at line N, and a second function that generates a random number between 0 and the number of lines in the file. While I was implementing those, I figured it may be more efficient to store the data in byte slices by default, rather than in separate files that have to be read at run time.

Question: How would I go about implementing a function that returns the string at a random line of the []byte representation of my file?

My function for getting a string from a file:

// atLine returns the text of line n (1-based) of the file at
// "./path/to/file", without its line terminator.
//
// It returns "" when n is less than 1, when the file has fewer than n
// lines, or when reading stops before line n is reached. It panics if the
// file cannot be opened.
func atLine(n int) (s string) {
    f, err := os.Open("./path/to/file")
    if err != nil {
        panic("Could not read file.")
    }
    defer f.Close()

    // A Scanner yields whole lines (up to bufio.MaxScanTokenSize bytes).
    // Reader.ReadLine hands back a line longer than its buffer in several
    // pieces; counting each piece as a line would shift every later line
    // number.
    sc := bufio.NewScanner(f)
    for i := 1; sc.Scan(); i++ {
        if i == n {
            return sc.Text()
        }
    }
    // Either the input ended or sc.Err() is set. The signature has no error
    // result, so both cases yield the empty string.
    return ""
}

Additional info:

  • Lines are at most 50 characters long
  • Lines have no special characters (although a solution handling those is welcome)
  • Number of lines in the files is known and so the same can be applied for []byte

Dealing with just the question part (and not the sanity of this) - you have a []byte and want to get a specific string line from it - the bytes.Reader has no ReadLine method which you will have already noticed.

You can pass a bytes reader to bufio.NewReader, and gain the ReadLine functionality you are trying to access.

bytesReader := bytes.NewReader([]byte("test1
test2
test3
"))
bufReader := bufio.NewReader(bytesReader)
value1, _, _ := bufReader.ReadLine()
value2, _, _ := bufReader.ReadLine()
value3, _, _ := bufReader.ReadLine()
fmt.Println(string(value1))
fmt.Println(string(value2))
fmt.Println(string(value3))

Obviously it is not sensible to ignore the errors, but for the purpose of brevity I do it here.

https://play.golang.org/p/fRQUfmZQke

Results:

test1
test2
test3

From here, it is straightforward to fit this back into your existing code.

Here is an example of fast (in the order of nanoseconds) random access to lines of text as byte data. The data is buffered and indexed in memory.

lines.go:

package main

import (
    "bytes"
    "fmt"
    "io/ioutil"
    "os"
)

// Lines gives random access, by 1-based line number, to the lines of an
// in-memory text buffer.
type Lines struct {
    data  []byte // the indexed text; NewLines slices it but does not copy it
    index []int  // flattened (start, end) pairs: line k (0-based) is data[index[2*k]:index[2*k+1]]
}

// NewLines builds a line index over data and returns it.
//
// A leading UTF-8 byte order mark is skipped. Lines are terminated by '\n';
// a '\r' immediately before the '\n' is excluded from the line. Trailing
// text after the last '\n' counts as a final line if it is non-empty.
//
// nLines is only a capacity hint for the index and may be inaccurate; a
// negative value is treated as zero.
func NewLines(data []byte, nLines int) *Lines {
    bom := []byte{0xEF, 0xBB, 0xBF}
    if bytes.HasPrefix(data, bom) {
        data = data[len(bom):]
    }
    if nLines < 0 {
        // make panics on a negative capacity; the hint is advisory anyway.
        nLines = 0
    }
    lines := Lines{data: data, index: make([]int, 0, 2*nLines)}
    for i := 0; ; {
        j := bytes.IndexByte(lines.data[i:], '\n')
        if j < 0 {
            // No further newline: whatever remains is an unterminated last line.
            if len(lines.data[i:]) > 0 {
                lines.index = append(lines.index, i)
                lines.index = append(lines.index, len(lines.data))
            }
            break
        }
        lines.index = append(lines.index, i)
        j += i    // j was relative to data[i:]; make it an absolute offset
        i = j + 1 // the next line starts just past the '\n'
        if j > 0 && lines.data[j-1] == '\r' {
            j-- // drop the CR of a CRLF terminator
        }
        lines.index = append(lines.index, j)
    }
    if len(lines.index) != cap(lines.index) {
        // The hint was off: copy the index into a newly allocated slice so a
        // poorly sized backing array is not retained.
        lines.index = append([]int(nil), lines.index...)
    }
    return &lines
}

// N returns the number of indexed lines.
func (l *Lines) N() int {
    return len(l.index) / 2
}

// At returns line n, where the first line is n == 1, without its line
// terminator. It returns an error if n is outside the range 1..N().
func (l *Lines) At(n int) (string, error) {
    if n < 1 || n > l.N() {
        err := fmt.Errorf(
            "data has %d lines: at %d out of range",
            l.N(), n,
        )
        return "", err
    }
    m := 2 * (n - 1) // position of this line's start offset in the flattened index
    return string(l.data[l.index[m]:l.index[m+1]]), nil
}

var (
    // fName is the path of the sample text used by main and the benchmarks:
    // The Complete Works of William Shakespeare, available from
    // http://www.gutenberg.org/cache/epub/100/pg100.txt
    fName  = `/home/peter/shakespeare.pg100.txt`
    // nLines is the line-count hint passed to NewLines for that file.
    nLines = 124787
)

func main() {
    data, err := ioutil.ReadFile(fName)
    if err != nil {
        fmt.Fprintln(os.Stderr, err)
        return
    }

    lines := NewLines(data, nLines)

    for _, at := range []int{1 - 1, 1, 2, 12, 42, 124754, lines.N(), lines.N() + 1} {
        line, err := lines.At(at)
        if err != nil {
            fmt.Fprintf(os.Stderr, "%d\t%v
", at, err)
            continue
        }
        fmt.Printf("%d\t%q
", at, line)
    }
}

Output:

0       data has 124787 lines: at 0 out of range
1       "The Project Gutenberg EBook of The Complete Works of William Shakespeare, by"
2       "William Shakespeare"
12      "Title: The Complete Works of William Shakespeare"
42      "SHAKESPEARE IS COPYRIGHT 1990-1993 BY WORLD LIBRARY, INC., AND IS"
124754  "http://www.gutenberg.org"
124787  "*** END: FULL LICENSE ***"
124788  data has 124787 lines: at 124788 out of range

lines_test.go:

package main

import (
    "io/ioutil"
    "math/rand"
    "testing"
)

// benchData reads the file named by fName and returns its contents,
// aborting the calling benchmark via b.Fatal if the read fails.
func benchData(b *testing.B) []byte {
    // Mark this function as a helper so a failure is reported at the
    // benchmark's call site rather than inside benchData.
    b.Helper()
    data, err := ioutil.ReadFile(fName)
    if err != nil {
        b.Fatal(err)
    }
    return data
}

// BenchmarkNewLines times building the line index over the sample file.
// Reading the file happens before the timer is reset, so it is not measured.
func BenchmarkNewLines(b *testing.B) {
    raw := benchData(b)
    b.ReportAllocs()
    b.ResetTimer()
    for remaining := b.N; remaining > 0; remaining-- {
        _ = NewLines(raw, nLines)
    }
}

// BenchmarkLineAt times Lines.At over a precomputed table of line numbers:
// the first line, the last line, and pseudo-random lines in between. The
// file read, the indexing and the table setup all happen before the timer
// is reset.
func BenchmarkLineAt(b *testing.B) {
    data := benchData(b)
    lines := NewLines(data, nLines)
    ats := make([]int, 4*1024)
    ats[0], ats[1] = 1, lines.N()
    // A local generator with a fixed seed keeps the sampled line numbers
    // reproducible without reseeding the process-wide math/rand source.
    rng := rand.New(rand.NewSource(42))
    for i := 2; i < len(ats); i++ {
        ats[i] = 1 + rng.Intn(lines.N())
    }
    b.ReportAllocs()
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        at := ats[i%len(ats)]
        line, err := lines.At(at)
        if err != nil {
            b.Error(err)
        }
        _ = line
    }
}

Output

$ go test -bench=. lines.go lines_test.go
BenchmarkNewLines-8       1000   1898347 ns/op     1998898 B/op   2 allocs/op
BenchmarkLineAt-8     50000000        45.1 ns/op        49 B/op   0 allocs/op