I have a list of arbitrary length, and I need to split it up into equal size chunks and operate on it. There are some obvious ways to do this, like keeping a counter and two lists, and when the second list fills up, add it to the first list and empty the second list for the next round of data, but this is potentially extremely expensive.
I was wondering if anyone had a good solution to this for lists of any length, e.g. using generators.
I was looking for something useful in itertools
but I couldn't find anything obviously useful. Might've missed it, though.
Related question: What is the most “pythonic” way to iterate over a list in chunks?
转载于:https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
Here's a generator that yields the chunks you want:
def chunks(l, n):
"""Yield successive n-sized chunks from l."""
for i in range(0, len(l), n):
yield l[i:i + n]
import pprint
pprint.pprint(list(chunks(range(10, 75), 10)))
[[10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
[30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
[40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
[50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
[60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
[70, 71, 72, 73, 74]]
If you're using Python 2, you should use xrange()
instead of range()
:
def chunks(l, n):
"""Yield successive n-sized chunks from l."""
for i in xrange(0, len(l), n):
yield l[i:i + n]
Also you can simply use list comprehension instead of writing a function. Python 3:
[l[i:i + n] for i in range(0, len(l), n)]
Python 2 version:
[l[i:i + n] for i in xrange(0, len(l), n)]
If you know list size:
def SplitList(list, chunk_size):
return [list[offs:offs+chunk_size] for offs in range(0, len(list), chunk_size)]
If you don't (an iterator):
def IterChunks(sequence, chunk_size):
res = []
for item in sequence:
res.append(item)
if len(res) >= chunk_size:
yield res
res = []
if res:
yield res # yield the last, incomplete, portion
In the latter case, it can be rephrased in a more beautiful way if you can be sure that the sequence always contains a whole number of chunks of given size (i.e. there is no incomplete last chunk).
Here is a generator that work on arbitrary iterables:
def split_seq(iterable, size):
it = iter(iterable)
item = list(itertools.islice(it, size))
while item:
yield item
item = list(itertools.islice(it, size))
Example:
>>> import pprint
>>> pprint.pprint(list(split_seq(xrange(75), 10)))
[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
[30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
[40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
[50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
[60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
[70, 71, 72, 73, 74]]
heh, one line version
In [48]: chunk = lambda ulist, step: map(lambda i: ulist[i:i+step], xrange(0, len(ulist), step))
In [49]: chunk(range(1,100), 10)
Out[49]:
[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
[11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
[21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
[31, 32, 33, 34, 35, 36, 37, 38, 39, 40],
[41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
[51, 52, 53, 54, 55, 56, 57, 58, 59, 60],
[61, 62, 63, 64, 65, 66, 67, 68, 69, 70],
[71, 72, 73, 74, 75, 76, 77, 78, 79, 80],
[81, 82, 83, 84, 85, 86, 87, 88, 89, 90],
[91, 92, 93, 94, 95, 96, 97, 98, 99]]
Directly from the (old) Python documentation (recipes for itertools):
from itertools import izip, chain, repeat
def grouper(n, iterable, padvalue=None):
"grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')"
return izip(*[chain(iterable, repeat(padvalue, n-1))]*n)
The current version, as suggested by J.F.Sebastian:
#from itertools import izip_longest as zip_longest # for Python 2.x
from itertools import zip_longest # for Python 3.x
#from six.moves import zip_longest # for both (uses the six compat library)
def grouper(n, iterable, padvalue=None):
"grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')"
return zip_longest(*[iter(iterable)]*n, fillvalue=padvalue)
I guess Guido's time machine works—worked—will work—will have worked—was working again.
These solutions work because [iter(iterable)]*n
(or the equivalent in the earlier version) creates one iterator, repeated n
times in the list. izip_longest
then effectively performs a round-robin of "each" iterator; because this is the same iterator, it is advanced by each such call, resulting in each such zip-roundrobin generating one tuple of n
items.
def split_seq(seq, num_pieces):
start = 0
for i in xrange(num_pieces):
stop = start + len(seq[i::num_pieces])
yield seq[start:stop]
start = stop
usage:
seq = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for seq in split_seq(seq, 3):
print seq
If you want something super simple:
def chunks(l, n):
n = max(1, n)
return (l[i:i+n] for i in xrange(0, len(l), n))
Without calling len() which is good for large lists:
def splitter(l, n):
i = 0
chunk = l[:n]
while chunk:
yield chunk
i += n
chunk = l[i:i+n]
And this is for iterables:
def isplitter(l, n):
l = iter(l)
chunk = list(islice(l, n))
while chunk:
yield chunk
chunk = list(islice(l, n))
The functional flavour of the above:
def isplitter2(l, n):
return takewhile(bool,
(tuple(islice(start, n))
for start in repeat(iter(l))))
OR:
def chunks_gen_sentinel(n, seq):
continuous_slices = imap(islice, repeat(iter(seq)), repeat(0), repeat(n))
return iter(imap(tuple, continuous_slices).next,())
OR:
def chunks_gen_filter(n, seq):
continuous_slices = imap(islice, repeat(iter(seq)), repeat(0), repeat(n))
return takewhile(bool,imap(tuple, continuous_slices))
def chunk(input, size):
return map(None, *([iter(input)] * size))
Simple yet elegant
l = range(1, 1000)
print [l[x:x+10] for x in xrange(0, len(l), 10)]
or if you prefer:
chunks = lambda l, n: [l[x: x+n] for x in xrange(0, len(l), n)]
chunks(l, 10)
If you had a chunk size of 3 for example, you could do:
zip(*[iterable[i::3] for i in range(3)])
source: http://code.activestate.com/recipes/303060-group-a-list-into-sequential-n-tuples/
I would use this when my chunk size is fixed number I can type, e.g. '3', and would never change.
Consider using matplotlib.cbook pieces
for example:
import matplotlib.cbook as cbook
segments = cbook.pieces(np.arange(20), 3)
for s in segments:
print s
def chunks(iterable,n):
"""assumes n is an integer>0
"""
iterable=iter(iterable)
while True:
result=[]
for i in range(n):
try:
a=next(iterable)
except StopIteration:
break
else:
result.append(a)
if result:
yield result
else:
break
g1=(i*i for i in range(10))
g2=chunks(g1,3)
print g2
'<generator object chunks at 0x0337B9B8>'
print list(g2)
'[[0, 1, 4], [9, 16, 25], [36, 49, 64], [81]]'
I realise this question is old (stumbled over it on Google), but surely something like the following is far simpler and clearer than any of the huge complex suggestions and only uses slicing:
def chunker(iterable, chunksize):
for i,c in enumerate(iterable[::chunksize]):
yield iterable[i*chunksize:(i+1)*chunksize]
>>> for chunk in chunker(range(0,100), 10):
... print list(chunk)
...
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
... etc ...
A generator expression:
def chunks(seq, n):
return (seq[i:i+n] for i in xrange(0, len(seq), n))
eg.
print list(chunks(range(1, 1000), 10))
I know this is kind of old but I don't why nobody mentioned numpy.array_split
:
lst = range(50)
In [26]: np.array_split(lst,5)
Out[26]:
[array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19]),
array([20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
array([30, 31, 32, 33, 34, 35, 36, 37, 38, 39]),
array([40, 41, 42, 43, 44, 45, 46, 47, 48, 49])]
I like the Python doc's version proposed by tzot and J.F.Sebastian a lot, but it has two shortcomings:
I'm using this one a lot in my code:
from itertools import islice
def chunks(n, iterable):
iterable = iter(iterable)
while True:
yield tuple(islice(iterable, n)) or iterable.next()
UPDATE: A lazy chunks version:
from itertools import chain, islice
def chunks(n, iterable):
iterable = iter(iterable)
while True:
yield chain([next(iterable)], islice(iterable, n-1))
The toolz library has the partition
function for this:
from toolz.itertoolz.core import partition
list(partition(2, [1, 2, 3, 4]))
[(1, 2), (3, 4)]
None of these answers are evenly sized chunks, they all leave a runt chunk at the end, so they're not completely balanced. If you were using these functions to distribute work, you've built-in the prospect of one likely finishing well before the others, so it would sit around doing nothing while the others continued working hard.
For example, the current top answer ends with:
[60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
[70, 71, 72, 73, 74]]
I just hate that runt at the end!
Others, like list(grouper(3, xrange(7)))
, and chunk(xrange(7), 3)
both return: [(0, 1, 2), (3, 4, 5), (6, None, None)]
. The None
's are just padding, and rather inelegant in my opinion. They are NOT evenly chunking the iterables.
Why can't we divide these better?
Here's a balanced solution, adapted from a function I've used in production (Note in Python 3 to replace xrange
with range
):
def baskets_from(items, maxbaskets=25):
baskets = [[] for _ in xrange(maxbaskets)] # in Python 3 use range
for i, item in enumerate(items):
baskets[i % maxbaskets].append(item)
return filter(None, baskets)
And I created a generator that does the same if you put it into a list:
def iter_baskets_from(items, maxbaskets=3):
'''generates evenly balanced baskets from indexable iterable'''
item_count = len(items)
baskets = min(item_count, maxbaskets)
for x_i in xrange(baskets):
yield [items[y_i] for y_i in xrange(x_i, item_count, baskets)]
And finally, since I see that all of the above functions return elements in a contiguous order (as they were given):
def iter_baskets_contiguous(items, maxbaskets=3, item_count=None):
'''
generates balanced baskets from iterable, contiguous contents
provide item_count if providing a iterator that doesn't support len()
'''
item_count = item_count or len(items)
baskets = min(item_count, maxbaskets)
items = iter(items)
floor = item_count // baskets
ceiling = floor + 1
stepdown = item_count % baskets
for x_i in xrange(baskets):
length = ceiling if x_i < stepdown else floor
yield [items.next() for _ in xrange(length)]
To test them out:
print(baskets_from(xrange(6), 8))
print(list(iter_baskets_from(xrange(6), 8)))
print(list(iter_baskets_contiguous(xrange(6), 8)))
print(baskets_from(xrange(22), 8))
print(list(iter_baskets_from(xrange(22), 8)))
print(list(iter_baskets_contiguous(xrange(22), 8)))
print(baskets_from('ABCDEFG', 3))
print(list(iter_baskets_from('ABCDEFG', 3)))
print(list(iter_baskets_contiguous('ABCDEFG', 3)))
print(baskets_from(xrange(26), 5))
print(list(iter_baskets_from(xrange(26), 5)))
print(list(iter_baskets_contiguous(xrange(26), 5)))
Which prints out:
[[0], [1], [2], [3], [4], [5]]
[[0], [1], [2], [3], [4], [5]]
[[0], [1], [2], [3], [4], [5]]
[[0, 8, 16], [1, 9, 17], [2, 10, 18], [3, 11, 19], [4, 12, 20], [5, 13, 21], [6, 14], [7, 15]]
[[0, 8, 16], [1, 9, 17], [2, 10, 18], [3, 11, 19], [4, 12, 20], [5, 13, 21], [6, 14], [7, 15]]
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11], [12, 13, 14], [15, 16, 17], [18, 19], [20, 21]]
[['A', 'D', 'G'], ['B', 'E'], ['C', 'F']]
[['A', 'D', 'G'], ['B', 'E'], ['C', 'F']]
[['A', 'B', 'C'], ['D', 'E'], ['F', 'G']]
[[0, 5, 10, 15, 20, 25], [1, 6, 11, 16, 21], [2, 7, 12, 17, 22], [3, 8, 13, 18, 23], [4, 9, 14, 19, 24]]
[[0, 5, 10, 15, 20, 25], [1, 6, 11, 16, 21], [2, 7, 12, 17, 22], [3, 8, 13, 18, 23], [4, 9, 14, 19, 24]]
[[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15], [16, 17, 18, 19, 20], [21, 22, 23, 24, 25]]
Notice that the contiguous generator provide chunks in the same length patterns as the other two, but the items are all in order, and they are as evenly divided as one may divide a list of discrete elements.
I'm surprised nobody has thought of using iter
's two-argument form:
from itertools import islice
def chunk(it, size):
it = iter(it)
return iter(lambda: tuple(islice(it, size)), ())
Demo:
>>> list(chunk(range(14), 3))
[(0, 1, 2), (3, 4, 5), (6, 7, 8), (9, 10, 11), (12, 13)]
This works with any iterable and produces output lazily. It returns tuples rather than iterators, but I think it has a certain elegance nonetheless. It also doesn't pad; if you want padding, a simple variation on the above will suffice:
from itertools import islice, chain, repeat
def chunk_pad(it, size, padval=None):
it = chain(iter(it), repeat(padval))
return iter(lambda: tuple(islice(it, size)), (padval,) * size)
Demo:
>>> list(chunk_pad(range(14), 3))
[(0, 1, 2), (3, 4, 5), (6, 7, 8), (9, 10, 11), (12, 13, None)]
>>> list(chunk_pad(range(14), 3, 'a'))
[(0, 1, 2), (3, 4, 5), (6, 7, 8), (9, 10, 11), (12, 13, 'a')]
Like the izip_longest
-based solutions, the above always pads. As far as I know, there's no one- or two-line itertools recipe for a function that optionally pads. By combining the above two approaches, this one comes pretty close:
_no_padding = object()
def chunk(it, size, padval=_no_padding):
if padval == _no_padding:
it = iter(it)
sentinel = ()
else:
it = chain(iter(it), repeat(padval))
sentinel = (padval,) * size
return iter(lambda: tuple(islice(it, size)), sentinel)
Demo:
>>> list(chunk(range(14), 3))
[(0, 1, 2), (3, 4, 5), (6, 7, 8), (9, 10, 11), (12, 13)]
>>> list(chunk(range(14), 3, None))
[(0, 1, 2), (3, 4, 5), (6, 7, 8), (9, 10, 11), (12, 13, None)]
>>> list(chunk(range(14), 3, 'a'))
[(0, 1, 2), (3, 4, 5), (6, 7, 8), (9, 10, 11), (12, 13, 'a')]
I believe this is the shortest chunker proposed that offers optional padding.
Another more explicit version.
def chunkList(initialList, chunkSize):
"""
This function chunks a list into sub lists
that have a length equals to chunkSize.
Example:
lst = [3, 4, 9, 7, 1, 1, 2, 3]
print(chunkList(lst, 3))
returns
[[3, 4, 9], [7, 1, 1], [2, 3]]
"""
finalList = []
for i in range(0, len(initialList), chunkSize):
finalList.append(initialList[i:i+chunkSize])
return finalList
I saw the most awesome Python-ish answer in a duplicate of this question:
from itertools import zip_longest
a = range(1, 16)
i = iter(a)
r = list(zip_longest(i, i, i))
>>> print(r)
[(1, 2, 3), (4, 5, 6), (7, 8, 9), (10, 11, 12), (13, 14, 15)]
You can create n-tuple for any n. If a = range(1, 15)
, then the result will be:
[(1, 2, 3), (4, 5, 6), (7, 8, 9), (10, 11, 12), (13, 14, None)]
If the list is divided evenly, then you can replace zip_longest
with zip
, otherwise the triplet (13, 14, None)
would be lost. Python 3 is used above. For Python 2, use izip_longest
.
code:
def split_list(the_list, chunk_size):
result_list = []
while the_list:
result_list.append(the_list[:chunk_size])
the_list = the_list[chunk_size:]
return result_list
a_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
print split_list(a_list, 3)
result:
[[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]]
a = [1, 2, 3, 4, 5, 6, 7, 8, 9]
CHUNK = 4
[a[i*CHUNK:(i+1)*CHUNK] for i in xrange((len(a) + CHUNK - 1) / CHUNK )]
At this point, I think we need a recursive generator, just in case...
In python 2:
def chunks(li, n):
if li == []:
return
yield li[:n]
for e in chunks(li[n:], n):
yield e
In python 3:
def chunks(li, n):
if li == []:
return
yield li[:n]
yield from chunks(li[n:], n)
Also, in case of massive Alien invasion, a decorated recursive generator might become handy:
def dec(gen):
def new_gen(li, n):
for e in gen(li, n):
if e == []:
return
yield e
return new_gen
@dec
def chunks(li, n):
yield li[:n]
for e in chunks(li[n:], n):
yield e
At this point, I think we need the obligatory anonymous-recursive function.
Y = lambda f: (lambda x: x(x))(lambda y: f(lambda *args: y(y)(*args)))
chunks = Y(lambda f: lambda n: [n[0][:n[1]]] + f((n[0][n[1]:], n[1])) if len(n[0]) > 0 else [])
[AA[i:i+SS] for i in range(len(AA))[::SS]]
Where AA is array, SS is chunk size. For example:
>>> AA=range(10,21);SS=3
>>> [AA[i:i+SS] for i in range(len(AA))[::SS]]
[[10, 11, 12], [13, 14, 15], [16, 17, 18], [19, 20]]
# or [range(10, 13), range(13, 16), range(16, 19), range(19, 21)] in py3
You may also use get_chunks
function of utilspie
library as:
>>> from utilspie import iterutils
>>> a = [1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> list(iterutils.get_chunks(a, 5))
[[1, 2, 3, 4, 5], [6, 7, 8, 9]]
You can install utilspie
via pip:
sudo pip install utilspie
Disclaimer: I am the creator of utilspie library.
One more solution
def make_chunks(data, chunk_size):
while data:
chunk, data = data[:chunk_size], data[chunk_size:]
yield chunk
>>> for chunk in make_chunks([1, 2, 3, 4, 5, 6, 7], 2):
... print chunk
...
[1, 2]
[3, 4]
[5, 6]
[7]
>>>
I was curious about the performance of different approaches and here it is:
Tested on Python 3.5.1
import time
batch_size = 7
arr_len = 298937
#---------slice-------------
print("\r\nslice")
start = time.time()
arr = [i for i in range(0, arr_len)]
while True:
if not arr:
break
tmp = arr[0:batch_size]
arr = arr[batch_size:-1]
print(time.time() - start)
#-----------index-----------
print("\r\nindex")
arr = [i for i in range(0, arr_len)]
start = time.time()
for i in range(0, round(len(arr) / batch_size + 1)):
tmp = arr[batch_size * i : batch_size * (i + 1)]
print(time.time() - start)
#----------batches 1------------
def batch(iterable, n=1):
l = len(iterable)
for ndx in range(0, l, n):
yield iterable[ndx:min(ndx + n, l)]
print("\r\nbatches 1")
arr = [i for i in range(0, arr_len)]
start = time.time()
for x in batch(arr, batch_size):
tmp = x
print(time.time() - start)
#----------batches 2------------
from itertools import islice, chain
def batch(iterable, size):
sourceiter = iter(iterable)
while True:
batchiter = islice(sourceiter, size)
yield chain([next(batchiter)], batchiter)
print("\r\nbatches 2")
arr = [i for i in range(0, arr_len)]
start = time.time()
for x in batch(arr, batch_size):
tmp = x
print(time.time() - start)
#---------chunks-------------
def chunks(l, n):
"""Yield successive n-sized chunks from l."""
for i in range(0, len(l), n):
yield l[i:i + n]
print("\r\nchunks")
arr = [i for i in range(0, arr_len)]
start = time.time()
for x in chunks(arr, batch_size):
tmp = x
print(time.time() - start)
#-----------grouper-----------
from itertools import zip_longest # for Python 3.x
#from six.moves import zip_longest # for both (uses the six compat library)
def grouper(iterable, n, padvalue=None):
"grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')"
return zip_longest(*[iter(iterable)]*n, fillvalue=padvalue)
arr = [i for i in range(0, arr_len)]
print("\r\ngrouper")
start = time.time()
for x in grouper(arr, batch_size):
tmp = x
print(time.time() - start)
Results:
slice
31.18285083770752
index
0.02184295654296875
batches 1
0.03503894805908203
batches 2
0.22681021690368652
chunks
0.019841909408569336
grouper
0.006506919860839844