I am creating a utility that needs to be aware of all the datasets and tables that exist in my BigQuery project. My current code for getting this information is as follows (using the Go API):
func populateExistingTableMap(service *bigquery.Service, cloudCtx context.Context, projectId string) (map[string]map[string]bool, error) {
    tableMap := map[string]map[string]bool{}

    call := service.Datasets.List(projectId)
    //call.Fields("datasets/datasetReference")

    if err := call.Pages(cloudCtx, func(page *bigquery.DatasetList) error {
        for _, v := range page.Datasets {
            if tableMap[v.DatasetReference.DatasetId] == nil {
                tableMap[v.DatasetReference.DatasetId] = map[string]bool{}
            }

            table_call := service.Tables.List(projectId, v.DatasetReference.DatasetId)
            //table_call.Fields("tables/tableReference")

            if err := table_call.Pages(cloudCtx, func(page *bigquery.TableList) error {
                for _, t := range page.Tables {
                    tableMap[v.DatasetReference.DatasetId][t.TableReference.TableId] = true
                }
                return nil
            }); err != nil {
                return errors.New("Error Parsing Table")
            }
        }
        return nil
    }); err != nil {
        return tableMap, err
    }

    return tableMap, nil
}
For a project with about 5,000 datasets, each with up to 10 tables, this code takes almost 15 minutes to return. Is there a faster way to iterate through the names of all existing datasets and tables? I have tried using the Fields method to return only the fields I need (see the commented-out lines above), but that results in exactly 50 of my datasets being returned.
Any ideas?
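A note on the Fields experiment: my understanding is that a partial response also needs to request nextPageToken; otherwise the response carries no token and call.Pages stops after the first page, which would explain getting back exactly 50 datasets. Below is a minimal sketch of what that could look like; the listDatasetIDs helper is just for illustration and I have not verified this against the current client version.

import (
    "context"

    "google.golang.org/api/bigquery/v2"
)

// listDatasetIDs is a hypothetical helper that keeps the partial response but
// also asks for nextPageToken so pagination can continue past the first page.
func listDatasetIDs(service *bigquery.Service, ctx context.Context, projectId string) ([]string, error) {
    var ids []string
    call := service.Datasets.List(projectId)
    // Without "nextPageToken" in the field list, the server omits the token
    // and call.Pages has nothing to fetch the next page with.
    call.Fields("datasets/datasetReference", "nextPageToken")
    err := call.Pages(ctx, func(page *bigquery.DatasetList) error {
        for _, d := range page.Datasets {
            ids = append(ids, d.DatasetReference.DatasetId)
        }
        return nil
    })
    return ids, err
}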
Here is an updated version of my code that lists the tables for each dataset concurrently. It reduced the processing time from about 15 minutes to about 3 minutes.
func populateExistingTableMap(service *bigquery.Service, cloudCtx context.Context, projectId string) (map[string]map[string]bool, error) {
    tableMap := map[string]map[string]bool{}

    call := service.Datasets.List(projectId)
    //call.Fields("datasets/datasetReference")

    if err := call.Pages(cloudCtx, func(page *bigquery.DatasetList) error {
        var wg sync.WaitGroup
        wg.Add(len(page.Datasets))
        for _, v := range page.Datasets {
            if tableMap[v.DatasetReference.DatasetId] == nil {
                tableMap[v.DatasetReference.DatasetId] = map[string]bool{}
            }
            // Hand each goroutine its own inner map so it never reads the outer
            // map while this loop is still adding keys to it (data race otherwise).
            innerMap := tableMap[v.DatasetReference.DatasetId]

            go func(datasetID string, innerMap map[string]bool) {
                defer wg.Done()

                table_call := service.Tables.List(projectId, datasetID)
                //table_call.Fields("tables/tableReference")

                if err := table_call.Pages(cloudCtx, func(page *bigquery.TableList) error {
                    for _, t := range page.Tables {
                        innerMap[t.TableReference.TableId] = true
                    }
                    return nil // NOTE: returning a non-nil error stops pagination.
                }); err != nil {
                    // TODO: Handle error.
                    fmt.Println(err)
                }
            }(v.DatasetReference.DatasetId, innerMap)
        }
        wg.Wait()
        return nil // NOTE: returning a non-nil error stops pagination.
    }); err != nil {
        // TODO: Handle error.
        return tableMap, err
    }

    return tableMap, nil
}
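The TODO comments above drop any error from the per-dataset goroutines. One way they could be surfaced instead is golang.org/x/sync/errgroup in place of the WaitGroup for one page of datasets; the sketch below is untested against a real project, and the listTablesForPage helper and the mutex-guarded map writes are illustrative rather than part of the code above.

import (
    "context"
    "sync"

    "golang.org/x/sync/errgroup"
    "google.golang.org/api/bigquery/v2"
)

// listTablesForPage is a hypothetical replacement for the per-page body above:
// it lists the tables of every dataset on one page concurrently, but propagates
// the first error back to the caller instead of printing and discarding it.
func listTablesForPage(ctx context.Context, service *bigquery.Service, projectId string,
    page *bigquery.DatasetList, tableMap map[string]map[string]bool, mu *sync.Mutex) error {

    g, ctx := errgroup.WithContext(ctx)
    for _, v := range page.Datasets {
        datasetID := v.DatasetReference.DatasetId // capture per iteration
        g.Go(func() error {
            tables := map[string]bool{}
            err := service.Tables.List(projectId, datasetID).Pages(ctx, func(p *bigquery.TableList) error {
                for _, t := range p.Tables {
                    tables[t.TableReference.TableId] = true
                }
                return nil
            })
            if err != nil {
                return err
            }
            // Guard the shared outer map; each goroutine writes a different key.
            mu.Lock()
            tableMap[datasetID] = tables
            mu.Unlock()
            return nil
        })
    }
    // Wait blocks until every goroutine finishes and returns the first error, if any.
    return g.Wait()
}

A buffered channel used as a semaphore would work just as well if pulling in errgroup is not an option; the main point is that errors come back to the caller rather than being printed inside the goroutine.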