In a personal project I am implementing a function that returns a random line from a long file. For it to work I have to create one function that returns the string at line N and a second function that generates a random number between 0 and the number of lines in the file. While I was implementing those I figured it may be more efficient to store the data in byte slices by default, rather than storing them in separate files, which have to be read at run time.
Question: How would I go about implementing a function that returns the string at a random line of the []byte
representation of my file?
My function for getting a string from a file:
// atLine returns the text of line n (1-based) of the file "./path/to/file",
// without its trailing "\n" or "\r\n".
//
// It returns "" when n is less than 1 or greater than the number of lines in
// the file, and it panics if the file cannot be opened.
func atLine(n int) (s string) {
	f, err := os.Open("./path/to/file")
	if err != nil {
		panic("Could not read file.")
	}
	defer f.Close()

	// No line is numbered below 1, so there is nothing to scan for.
	if n < 1 {
		return ""
	}

	r := bufio.NewReader(f)
	for i := 1; ; i++ {
		// ReadString always returns the complete line. ReadLine would hand
		// back a line longer than the reader's buffer in several pieces,
		// which both truncates the result and throws off the line count.
		line, err := r.ReadString('\n')
		if err != nil && len(line) == 0 {
			// End of input (or a read error) with nothing left to return.
			return ""
		}
		if i != n {
			continue
		}
		// Strip the line terminator the same way ReadLine does: a final
		// "\n", and a "\r" only when it directly precedes that "\n".
		if k := len(line); k > 0 && line[k-1] == '\n' {
			line = line[:k-1]
			if k > 1 && line[k-2] == '\r' {
				line = line[:k-2]
			}
		}
		return line
	}
}
Additional info:
[]byte
Dealing with just the question part (and not the sanity of this) - you have a []byte
and want to get a specific string line from it - the bytes.Reader
has no ReadLine
method which you will have already noticed.
You can pass a bytes reader to bufio.NewReader
, and gain the ReadLine
functionality you are trying to access.
bytesReader := bytes.NewReader([]byte("test1
test2
test3
"))
bufReader := bufio.NewReader(bytesReader)
value1, _, _ := bufReader.ReadLine()
value2, _, _ := bufReader.ReadLine()
value3, _, _ := bufReader.ReadLine()
fmt.Println(string(value1))
fmt.Println(string(value2))
fmt.Println(string(value3))
Obviously it is not sensible to ignore the errors, but for the purpose of brevity I do it here.
https://play.golang.org/p/fRQUfmZQke
Results:
test1
test2
test3
From here, it is straightforward to fit this back into your existing code.
Here is an example of fast (in the order of nanoseconds) random access to lines of text as byte data. The data is buffered and indexed in memory.
lines.go
:
package main
import (
"bytes"
"fmt"
"io/ioutil"
"os"
)
// Lines gives indexed access to the individual lines of an in-memory text
// buffer. Build one with NewLines.
type Lines struct {
	data  []byte // the text, with any leading UTF-8 byte order mark removed
	index []int  // line start, end pairs for data[start:end]
}

// NewLines indexes the lines of data and returns the resulting Lines.
//
// A leading UTF-8 byte order mark is skipped. Lines are separated by "\n";
// a "\r" immediately before the "\n" is not part of the line. A final line
// that has no terminator is still counted, provided it is not empty.
//
// nLines is only a hint for the expected number of lines, used to pre-size
// the index; any value is accepted, and negative values are treated as 0.
// The returned Lines keeps a reference to data rather than copying it.
func NewLines(data []byte, nLines int) *Lines {
	bom := []byte{0xEF, 0xBB, 0xBF}
	if bytes.HasPrefix(data, bom) {
		data = data[len(bom):]
	}
	// A negative capacity would make the make call below panic.
	if nLines < 0 {
		nLines = 0
	}
	lines := Lines{data: data, index: make([]int, 0, 2*nLines)}
	for i := 0; ; {
		j := bytes.IndexByte(lines.data[i:], '\n')
		if j < 0 {
			// No more terminators: whatever remains is the last line.
			if len(lines.data[i:]) > 0 {
				lines.index = append(lines.index, i, len(lines.data))
			}
			break
		}
		start := i
		j += i    // make j an offset into data rather than into data[i:]
		i = j + 1 // the next line starts just past the "\n"
		if j > 0 && lines.data[j-1] == '\r' {
			j-- // exclude the "\r" of a "\r\n" terminator
		}
		lines.index = append(lines.index, start, j)
	}
	// The hint was not exact: copy the index into a freshly sized slice so
	// that no over-estimated capacity is retained.
	if len(lines.index) != cap(lines.index) {
		lines.index = append([]int(nil), lines.index...)
	}
	return &lines
}

// N returns the number of lines in the index.
func (l *Lines) N() int {
	return len(l.index) / 2
}

// At returns line n, where the first line is line 1, without its line
// terminator. It returns an error if n is not in the range [1, l.N()].
func (l *Lines) At(n int) (string, error) {
	if 1 > n || n > l.N() {
		err := fmt.Errorf(
			"data has %d lines: at %d out of range",
			l.N(), n,
		)
		return "", err
	}
	m := 2 * (n - 1) // position of line n's start offset in the index
	return string(l.data[l.index[m]:l.index[m+1]]), nil
}
var (
// The Complete Works of William Shakespeare
// http://www.gutenberg.org/cache/epub/100/pg100.txt
// fName is the path of the text file read by main and by the benchmarks.
fName = `/home/peter/shakespeare.pg100.txt`
// nLines is the line count passed to NewLines along with the file's data.
nLines = 124787
)
func main() {
data, err := ioutil.ReadFile(fName)
if err != nil {
fmt.Fprintln(os.Stderr, err)
return
}
lines := NewLines(data, nLines)
for _, at := range []int{1 - 1, 1, 2, 12, 42, 124754, lines.N(), lines.N() + 1} {
line, err := lines.At(at)
if err != nil {
fmt.Fprintf(os.Stderr, "%d\t%v
", at, err)
continue
}
fmt.Printf("%d\t%q
", at, line)
}
}
Output:
0 data has 124787 lines: at 0 out of range
1 "The Project Gutenberg EBook of The Complete Works of William Shakespeare, by"
2 "William Shakespeare"
12 "Title: The Complete Works of William Shakespeare"
42 "SHAKESPEARE IS COPYRIGHT 1990-1993 BY WORLD LIBRARY, INC., AND IS"
124754 "http://www.gutenberg.org"
124787 "*** END: FULL LICENSE ***"
124788 data has 124787 lines: at 124788 out of range
lines_test.go
:
package main
import (
"io/ioutil"
"math/rand"
"testing"
)
// benchData loads the contents of the file named by fName for use in a
// benchmark, failing the benchmark immediately if the file cannot be read.
func benchData(b *testing.B) []byte {
	contents, readErr := ioutil.ReadFile(fName)
	if readErr == nil {
		return contents
	}
	b.Fatal(readErr)
	return contents
}
// BenchmarkNewLines measures the cost of building the line index for the
// benchmark file. Loading the file is excluded from the timing.
func BenchmarkNewLines(b *testing.B) {
	text := benchData(b)

	b.ReportAllocs()
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		_ = NewLines(text, nLines)
	}
}
func BenchmarkLineAt(b *testing.B) {
data := benchData(b)
lines := NewLines(data, nLines)
ats := make([]int, 4*1024)
ats[0], ats[1] = 1, lines.N()
rand.Seed(42)
for i := range ats[2:] {
ats[2+i] = 1 + rand.Intn(lines.N())
}
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
at := ats[i%len(ats)]
line, err := lines.At(at)
if err != nil {
b.Error(err)
}
_ = line
}
}
Output
$ go test -bench=. lines.go lines_test.go
BenchmarkNewLines-8 1000 1898347 ns/op 1998898 B/op 2 allocs/op
BenchmarkLineAt-8 50000000 45.1 ns/op 49 B/op 0 allocs/op