Skip to content

Commit 8689e9b

Browse files
Merge pull request #2 from RamanjaneyuluIdavalapati/master
diskarry stabilized code
2 parents 669f6a9 + 6c5779f commit 8689e9b

File tree

9 files changed

+341
-21
lines changed

9 files changed

+341
-21
lines changed

.travis.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
language: python
2+
python:
3+
- '2.7'
4+
install:
5+
- pip install .
6+
script:
7+
- python setup.py test
8+
deploy:
9+
- provider: releases
10+
api_key:
11+
secure: KuV+GjVaFNKvhpI3rgBjolmdhtRWbnJSOgy2iDT4+GQMEz+ypXF3XGR8Opx8NDDeFoBzRxcLcfqRo0Og/i06n5IZ/GcqppErJxYGb984qAJVymukm7pUO4+tls10pDrzZH0+4tTp3SNHukUlcUFjk/+bCTrD67uCZsQGCob3aflLBNx+uL+q3TinF/gbLKdf6wLQqVzkye//ZC20zjZWLRQpyQPRAH1CgGKtRETo5BgSq9w4LbGZd0pGc3S3b33wf3MVzfVlepuXHtwEpviXlXYImRX8/giw6SIx/EJN5IZFkeyGFBetdPsN6dCcOiWAlaFAlrsUSb/YtlrNWZOizkUpmzlAmPTgpl/rW1kS2UUxjLMV1w3oaBt8bhRhX97C0SI0gO2cMWO4E2NIqUFG+rz7Y9VBb/ZpWTlaT5odU+paIBYT0ii6m79YYVu53ajyB6e26zN1Mw12fmRlzBTWsZopxVa22P1+zuIEqtN9meMu5KKONuQ7FL3iNphA8RGguj9X6NKVy4PbbO/25fGScy1oTxsAVCDsiq9x3M+tFg8+9g1fJJ1Ry30wq2cqe1L9o3AaEcuoIvBhf2cIj2ZO1NQAFr9/pkr7t4w/HfJsrRGmlK4hLFkNwZUPdufIS/1s/66lHIiaXacM069xz47zpuxNftjjF3DoZX5Ge/wjKn8=
12+
name: diskarray-0.1.1
13+
tag_name: 0.1.1
14+
on:
15+
repo: deep-compute/diskarray
16+
- provider: pypi
17+
distributions: "sdist bdist_wheel"
18+
server: https://upload.pypi.org/legacy/
19+
user: deepcompute
20+
password:
21+
secure: "XScKeAhmF7gGThEjJCtur1RHlGUZ2n3FXukp7619YHmLoUzvszo8Wg+ZSJrdl3soEuSvVQdn2G3ngxAZdKmNhWVYrUKRdKL5iY4WyYHVujkOq+diVqCGHWbSmZJupyscgt1L/H8l+IohC1dnng/ThQuFp7Jbay68lM8LzS20f16JgSL6Xq+jRqqtBU3jALoqf9scnwuXG+Yj51YTQ9DmS37ctlLyzg3GEbICQB1dNaSZ3HtM1LB0/69++rhukzicm1Z9FcNEbdL9U7ohAgrI1+0mj/4xtURIrOwvlKjhjXUxf5S2RMe49xFq4KDRkXL8uwUEvRnskwXH6u0+mkAQRpMYFMygxcmiVhuhhelJD43RuO56o84IWQLAwje/RUeH1huePUNBV63tAXQC1uQ4yoaZs/DehtjDjgIkk/j4xdTnlwaN68icGrnWxZ5QCwZkz16OQ84HolTLGG9X1Fuqs7iYGI9GmBYfSG6FVp6H6E1Cakvc492gAVqIJKln6Y5u8a1SPpHp2jhhqzsMQo/fwqDTc8m59ENX1zxcIgtJjaWrgkJ4U+W4GBosKpevTJCLKzpirsfJ2Al/7E7rTEOpXpw3NNeeyben708/RiPbWVvhE6yZ7Z6XHYzIO+fF9gr0pRiqdL3uL3ASCUlNEBvw5j7adH57foeTf8mZkQqLyrU="
22+
on:
23+
tags: true
24+
branch: master

README.md

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
# DiskArray
2+
3+
A resizable and readable numpy array on disk.
4+
5+
This module is built on numpy `memmap`, which allows accessing and modifying small segments of large files on disk without reading the entire file into memory.
6+
7+
This module also supports appending numpy arrays to the disk array at any time.
8+
9+
## Installation
10+
11+
> Prerequisites: Python
12+
13+
```bash
14+
$ sudo pip install diskarray
15+
```
16+
17+
## Quick Example
18+
19+
```python
20+
>>> import numpy as np
21+
>>> from diskarray import DiskArray
22+
23+
>>> data = np.array([[2 , 3, 4], [1, 2, 3]])
24+
25+
>>> da = DiskArray('/tmp/disk.array', shape=(0, 3), dtype=np.float32)
26+
27+
>>> da.extend(data)
28+
29+
>>> print(da[:])
30+
```
31+
32+
## Usage
33+
34+
`DiskArray` supports two methods, extend and append.
35+
36+
`extend` appends several arrays (rows) to the disk array in one call.
37+
38+
`append` appends a single array (one row) at a time.
39+
40+
### Importing
41+
42+
#### Using extend
43+
44+
Example1:
45+
46+
```python
47+
>>> import numpy as np
48+
>>> from diskarray import DiskArray
49+
50+
>>> data = np.array([[2 , 3, 4], [1, 2, 3]])
51+
52+
# creating object to disk array
53+
>>> da = DiskArray('/tmp/disk.array', shape=(0, 3), capacity=(10, 3), growby=200, dtype=np.float32)
54+
55+
# extend the data to disk array
56+
>>> da.extend(data)
57+
58+
# Get the full array
59+
>>> print(da[:])
60+
61+
# Get the row at index 1 (indexing starts at 0)
62+
>>> print(da[1])
63+
64+
# Get the rows at indices 1 and 2 (the end of the slice is exclusive)
65+
>>> print(da[1:3])
66+
67+
# Get the value at row index 1, column index 1
68+
>>> print(da[1][1])
69+
```
70+
71+
- `/tmp/disk.array` is the file which holds disk arrays.
72+
- `shape` is the size of the disk array.
73+
- `capacity` is the total capacity of the disk array.
74+
It is used because extending the disk array beyond its current size makes DiskArray re-create the memmap on the file, which is a costlier operation.
75+
So `capacity` is used to create the disk array with the size of `capacity` directly, up front.
76+
77+
- `capacity` and `growby` are optional; when they are not given, `capacity` defaults to `shape` and `growby` defaults to `10000`.
78+
79+
Example2:
80+
81+
```python
82+
>>> import numpy as np
83+
>>> from diskarray import DiskArray
84+
85+
>>> dtype = [('token', np.uint32), ('count', np.uint32), ('pmi', np.float32)]
86+
87+
>>> data = np.array([[(1, 0, 0.), (0, 2, 0.), (0, 2, 0.)], [(1, 0, 0.), (0, 2, 0.), (0, 2, 0.)]], dtype=dtype)
88+
89+
>>> da = DiskArray('/tmp/disk.array', shape=(0, 3), capacity=(10, 3), dtype=dtype)
90+
91+
>>> da.extend(data)
92+
93+
# Get the full array
94+
>>> print(da[:])
95+
96+
# Get the count values of the row at index 1
97+
>>> print(da[1]['count'])
98+
99+
# Get the token value at row index 1, column index 2
100+
>>> print(da[1][2]['token'])
101+
102+
# Modify the pmi value at row index 1, column index 2
103+
>>> da[1][2]['pmi'] = 10.0
104+
```
105+
106+
#### Using append
107+
108+
Example:
109+
110+
```python
111+
>>> import numpy as np
112+
>>> from diskarray import DiskArray
113+
114+
>>> data = np.array([2, 3, 4])
115+
116+
# creating object to disk array
117+
>>> da = DiskArray('/tmp/disk.array', shape=(0, 3), capacity=(10, 3), growby=200, dtype=np.float32)
118+
119+
# append 1 dimensional array to disk array
120+
>>> da.append(data)
121+
>>> da.append(data + 1)
122+
123+
# Get the full array
124+
>>> print(da[:])
125+
126+
# Get the row at index 1 (indexing starts at 0)
127+
>>> print(da[1])
128+
129+
# Get the rows at indices 1 and 2 (the end of the slice is exclusive)
130+
>>> print(da[1:3])
131+
132+
# Get the value at row index 1, column index 1
133+
>>> print(da[1][1])
134+
```
135+
136+
`growby` is used to increase the size of the disk array when it reaches its maximum limit.
137+
138+
### Interactive console
139+
140+
```bash
141+
# diskarray provides a command to interact with a disk array directly
142+
143+
$ diskarray interact <fpath> <shape> <dtype> --capacity <capacity> --growby <growby> --mode <mode>
144+
145+
# <fpath> is the input file which is used to store disk arrays.
146+
# <shape> is the size of the disk array.
147+
# <dtype> is the data type of the disk array.
148+
# <capacity> is the total capacity of the disk array.
149+
# <growby> is used to increase the size of the disk array when it reaches its maximum limit.
150+
# <mode> is the mode in which the disk array is opened.
151+
```
152+
153+
Example:
154+
155+
```bash
156+
$ diskarray interact /tmp/test '(0, 3)' np.float32 --capacity '(10, 3)' --growby 5 --mode r+
157+
DiskArray Console
158+
>>> import numpy as np
159+
>>> da.append(np.array([1, 2, 3]))
160+
```
161+
162+
## Running Tests
163+
164+
```
165+
$ python setup.py test
166+
```

diskarray/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1+
from .command import main
12
from .diskarray import DiskArray

diskarray/command.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import code
2+
3+
from basescript import BaseScript
4+
import numpy as np
5+
6+
from .diskarray import DiskArray
7+
8+
class DiskArrayCommand(BaseScript):
    '''
    Command-line front end for DiskArray.

    Registers an ``interact`` subcommand which builds a DiskArray from the
    command-line arguments and starts an interactive Python console in
    which that array is available under the name ``da``.
    '''

    DESC = 'DiskArray command-line tool'

    # Defaults for the optional arguments of the ``interact`` subcommand.
    DEFAULT_CAPACITY = None
    DEFAULT_GROWBY = 10000
    DEFAULT_MODE = 'r+'

    def interact(self):
        '''
        Open the disk array described by ``self.args`` and start an
        interactive console with it bound to the name ``da``.
        '''
        # Imported here so this method does not depend on a module-level
        # import being present.
        import ast

        args = self.args

        # shape and capacity arrive as tuple literals such as '(0, 3)'.
        # literal_eval parses literals only, so arbitrary expressions
        # given on the command line are rejected instead of executed.
        shape = ast.literal_eval(args.shape)
        if args.capacity:
            capacity = ast.literal_eval(args.capacity)
        else:
            # Keep the "not given" value as is (DEFAULT_CAPACITY).
            capacity = args.capacity

        # SECURITY NOTE: dtype is still evaluated as a Python expression
        # (eg: 'np.float32'), so whatever is passed here is executed.
        # Only pass trusted values for this argument.
        dtype = eval(args.dtype)

        da = DiskArray(fpath=args.fpath,
                       shape=shape,
                       capacity=capacity,
                       growby=args.growby,
                       dtype=dtype,
                       mode=args.mode)

        code.interact("DiskArray Console", local=dict(da=da))

    def define_subcommands(self, subcommands):
        '''
        Register the ``interact`` subcommand and its arguments on top of
        the subcommands defined by the parent class.
        '''
        super(DiskArrayCommand, self).define_subcommands(subcommands)

        interact_cmd = subcommands.add_parser('interact',
                                              help='DiskArray Console')
        interact_cmd.set_defaults(func=self.interact)

        # Help texts are built with adjacent string literals rather than
        # backslash continuations inside the literal, so that indentation
        # of the continuation lines never becomes part of the text.
        interact_cmd.add_argument(
            'fpath',
            help='Input file which is used to store disk arrays. '
                 'eg: /tmp/disk.array')
        interact_cmd.add_argument(
            'shape',
            help='shape is the size of the disk array. '
                 "eg: '(0, 3)'")
        interact_cmd.add_argument(
            'dtype',
            help='data type of the disk array. '
                 'eg: np.float32')
        interact_cmd.add_argument(
            '-c', '--capacity',
            default=self.DEFAULT_CAPACITY, type=str,
            help='capacity is the total capacity of the disk array. '
                 'This is optional and default is shape value '
                 "eg: --capacity '(10, 3)'")
        interact_cmd.add_argument(
            '-g', '--growby',
            default=self.DEFAULT_GROWBY, type=int,
            help='growby is used to increase the size of '
                 'the disk array when it reaches its maximum limit. '
                 'This is optional and default is 10000 '
                 'eg: --growby 200')
        interact_cmd.add_argument(
            '-m', '--mode',
            default=self.DEFAULT_MODE, type=str,
            help='mode is to open the disk array in that mode. '
                 'Example modes are r+, r, w+ and c '
                 'This is optional and default is r+')
68+
69+
def main():
    '''Console entry point: build the command object and run it.'''
    command = DiskArrayCommand()
    command.start()


# Allow running this module directly as a script.
if __name__ == '__main__':
    main()

diskarray/diskarray.py

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,23 @@
11
import os
2+
import sys
3+
from functools import reduce
4+
from logging import Logger
25

36
import numpy as np
4-
from deeputil import Dummy
57

6-
DUMMY_LOG = Dummy()
8+
from .exception import AppendNotSupported
79

810
class DiskArray(object):
911
GROWBY = 10000
1012

11-
def __init__(self, fpath, shape, dtype, mode='r+',
12-
capacity=None, growby=GROWBY, log=DUMMY_LOG):
13+
def __init__(self, fpath, dtype, mode='r+', shape=None,
14+
capacity=None, growby=GROWBY, log=Logger):
15+
'''
16+
>>> import numpy as np
17+
>>> da = DiskArray('/tmp/test.array', shape=(0, 3), dtype=np.float32)
18+
>>> print(da[:])
19+
[]
20+
'''
1321

1422
self._fpath = fpath
1523
self._shape = shape
@@ -41,13 +49,13 @@ def flush(self):
4149
self.data.flush()
4250
self._truncate_if_needed()
4351

44-
def _shape_bytes(self, shape):
52+
def _shape_bytes(self, shape, dtype_bytes):
4553
return reduce((lambda x, y: x * y), shape) * dtype_bytes
4654

4755
def _truncate_if_needed(self):
4856
fd = os.open(self._fpath, os.O_RDWR|os.O_CREAT)
4957
dtype_bytes = np.dtype(self._dtype).itemsize
50-
nbytes = self._shape_bytes(self._shape)
58+
nbytes = self._shape_bytes(self._shape, dtype_bytes)
5159
os.ftruncate(fd, nbytes)
5260
self._create_ndarray()
5361

@@ -78,6 +86,19 @@ def _incr_shape(self, shape, n):
7886
return tuple(_s)
7987

8088
def append(self, v):
89+
'''
90+
>>> import numpy as np
91+
>>> da = DiskArray('/tmp/test.array', shape=(0, 3), growby=3, dtype=np.float32)
92+
>>> print(da[:])
93+
[]
94+
>>> data = np.array([[2,3,4], [1, 2, 3]])
95+
>>> da.append(data[0])
96+
>>> print(da[:])
97+
[[ 2. 3. 4.]
98+
[ 0. 0. 0.]
99+
[ 0. 0. 0.]]
100+
'''
101+
81102
# FIXME: for now we only support
82103
# append along axis 0 and only
83104
# for 1d and 2d arrays
@@ -101,6 +122,36 @@ def append(self, v):
101122
self._shape = self._incr_shape(self._shape, 1)
102123

103124
def extend(self, v):
125+
'''
126+
>>> import numpy as np
127+
>>> da = DiskArray('/tmp/test.array', shape=(0, 3), capacity=(10, 3), dtype=np.float32)
128+
>>> print(da[:])
129+
[[ 2. 3. 4.]
130+
[ 0. 0. 0.]
131+
[ 0. 0. 0.]
132+
[ 0. 0. 0.]
133+
[ 0. 0. 0.]
134+
[ 0. 0. 0.]
135+
[ 0. 0. 0.]
136+
[ 0. 0. 0.]
137+
[ 0. 0. 0.]
138+
[ 0. 0. 0.]]
139+
>>> data = np.array([[2,3,4], [1, 2, 3]])
140+
>>> da.extend(data)
141+
>>> print(da[:])
142+
[[ 2. 3. 4.]
143+
[ 1. 2. 3.]
144+
[ 0. 0. 0.]
145+
[ 0. 0. 0.]
146+
[ 0. 0. 0.]
147+
[ 0. 0. 0.]
148+
[ 0. 0. 0.]
149+
[ 0. 0. 0.]
150+
[ 0. 0. 0.]
151+
[ 0. 0. 0.]]
152+
>>> os.remove('/tmp/test.array')
153+
'''
154+
104155
nrows = self._shape[0]
105156
nrows_capacity = self._capacity_shape[0]
106157
remaining_capacity = nrows_capacity - nrows

example.py

Lines changed: 0 additions & 9 deletions
This file was deleted.

0 commit comments

Comments
 (0)