It might not work in Excel but with Python you can always use a generator to stream the contents of a file, so memory should not be an issue (and then you can basically filter for a certain period depending on the date in the row). Something like:
import csv
def read_large_csv(file_path):
    """Lazily yield the rows of a CSV file one at a time.

    Because this is a generator, only the current row is ever held in
    memory, so arbitrarily large files can be processed.

    Parameters:
        file_path: path to the CSV file to read.

    Yields:
        list[str]: the fields of each row, as produced by csv.reader.
    """
    # newline="" is required by the csv module docs so that embedded
    # newlines inside quoted fields are parsed correctly.
    with open(file_path, "r", newline="") as csvfile:
        datareader = csv.reader(csvfile)
        # yield the rows; the file stays open only while the
        # generator is being consumed
        for row in datareader:
            yield row
# Stream the file one row at a time; memory use stays flat because the
# generator never materializes the whole file.
for record in read_large_csv("some_file.csv"):
    print(record)
    # do something with the record(s)
Both solutions stopped working for me. I get a memory error. MemoryError: Unable to allocate 346. MiB for an array with shape (7, 6482608) and data type float64 Why does it say array? My code: import csv
def read_large_csv(file_path):
    """Yield each row of *file_path* lazily so the file is never fully in memory.

    Parameters:
        file_path: path to the CSV file to read.

    Yields:
        list[str]: one parsed CSV row per iteration.
    """
    # Open with newline="" — the csv module documentation requires this
    # so quoted fields containing newlines round-trip correctly.
    with open(file_path, "r", newline="") as csvfile:
        datareader = csv.reader(csvfile)
        # yield the rows one by one
        for row in datareader:
            yield row
csv_path = "some_file.csv"
# Consume the generator row by row to keep memory usage constant.
for line in read_large_csv(csv_path):
    print(line)  # do something with the row(s)
Hmm with the generator solution that I posted it is basically impossible to get a memory error. Can you post your entire code maybe? What you posted seems to be just the snippet that I added.
I posted your snippet because it is the code I run. My complete code:
import simfin as sf
import pandas as pd
import csv
import os
def read_large_csv(file_path):
    """Stream a CSV file, yielding one parsed row at a time.

    A generator keeps peak memory constant regardless of file size.

    Parameters:
        file_path: path to the CSV file to read.

    Yields:
        list[str]: the fields of the next row.
    """
    # newline="" per the csv module docs: lets csv.reader handle
    # embedded newlines in quoted fields correctly.
    with open(file_path, "r", newline="") as csvfile:
        datareader = csv.reader(csvfile)
        # yield the rows
        for row in datareader:
            yield row
# Build the absolute path to the downloaded SimFin CSV on disk.
absolute_path = os.path.dirname(os.path.abspath(__file__))
path_list = absolute_path.split(os.sep)
# NOTE(review): this reassembles "<drive>\<dir1>\<dir2>\simfin_data\..."
# from the script's own location — presumably the user's home directory.
# Fragile if the script ever lives elsewhere; verify it matches the data
# dir configured for simfin below.
file_path = str(os.path.join(path_list[0], '\\', path_list[1], path_list[2], "simfin_data", "us-shareprices-daily.csv"))
#print('File path:', file_path)
# Configure simfin: API key and local data directory.
sf.load_api_key(path='~/simfin_data/simfin_api_key.txt')
sf.set_data_dir('~/simfin_data/')
# NOTE(review): per the traceback below, load_shareprices() not only
# downloads the dataset but also parses the whole CSV into a pandas
# DataFrame — this call is where the MemoryError is raised.
sf.load_shareprices(variant='daily', market='us', refresh_days=1)
# this worked (in the past)
for row in read_large_csv(file_path):
    # do something with the row(s)
    print(row)
# using pandas worked as well
# shareprices_daily = pd.read_csv(file_path, delimiter=';', parse_dates=True)
# shareprices_daily['Date'] = pd.to_datetime(shareprices_daily['Date'])
# print(shareprices_daily) My console log: Dataset "us-shareprices-daily" on disk (0 days old).
- Loading from disk ... Traceback (most recent call last):
File "C:/Users/**/simfin/simfin_daily_price.py", line 22, in <module>
sf.load_shareprices(variant='daily', market='us', refresh_days=1)
File "C:\Users\**\venv\lib\site-packages\simfin\load.py", line 145, in load
parse_dates=parse_dates, date_parser=date_parser)
File "C:\Users\**\venv\lib\site-packages\pandas\io\parsers.py", line 688, in read_csv
return _read(filepath_or_buffer, kwds)
File "C:\Users**\venv\lib\site-packages\pandas\io\parsers.py", line 460, in _read
data = parser.read(nrows)
File "C:\**\venv\lib\site-packages\pandas\io\parsers.py", line 1213, in read
df = DataFrame(col_dict, columns=columns, index=index)
File "C:\Users\**\venv\lib\site-packages\pandas\core\frame.py", line 468, in __init__
mgr = init_dict(data, index, columns, dtype=dtype)
File "C:\Users\**\venv\lib\site-packages\pandas\core\internals\construction.py", line 283, in init_dict
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
File "**\venv\lib\site-packages\pandas\core\internals\construction.py", line 93, in arrays_to_mgr
return create_block_manager_from_arrays(arrays, arr_names, axes)
File "**\venv\lib\site-packages\pandas\core\internals\managers.py", line 1683, in create_block_manager_from_arrays
blocks = form_blocks(arrays, names, axes)
File "**\venv\lib\site-packages\pandas\core\internals\managers.py", line 1757, in form_blocks
datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], DT64NS_DTYPE)
File "**\venv\lib\site-packages\pandas\core\internals\managers.py", line 1817, in _simple_blockify
values, placement = _stack_arrays(tuples, dtype)
File "**\venv\lib\site-packages\pandas\core\internals\managers.py", line 1863, in _stack_arrays
stacked = np.empty(shape, dtype=dtype)
MemoryError: Unable to allocate 49.8 MiB for an array with shape (1, 6530789) and data type datetime64[ns]
Hmm ok I think the problem is that sf.load_shareprices(variant='daily', market='us', refresh_days=1) will download the data AND load it into memory, even if you don't use the data afterwards. So I guess the script can fail there if you don't have enough memory (for me it works though with 16GB ram).
In order to never load the full file into memory but just to download it you have to modify the code a bit:
import simfin as sf
import pandas as pd
import csv
import os
# import the download function
from simfin.download import _maybe_download_dataset
def read_large_csv(file_path):
    """Stream a CSV file lazily, yielding one parsed row at a time.

    Only the current row is held in memory, so files of any size can
    be traversed.
    """
    with open(file_path, "r") as handle:
        # Delegate straight to csv.reader: every row it produces is
        # handed to the caller unchanged.
        yield from csv.reader(handle)
# Configure simfin: API key and local data directory.
sf.load_api_key(path='~/simfin_data/simfin_api_key.txt')
sf.set_data_dir('~/simfin_data/')
dataset_args = {'dataset': "shareprices", 'variant': "daily", 'market': "us"}
# Download file if it does not exist on local disk, or if it is too old.
# Unlike sf.load_shareprices(), this only fetches the file and (per the
# surrounding discussion) does not parse it into a DataFrame, so the full
# dataset is never held in memory. NOTE(review): the leading underscore
# marks _maybe_download_dataset as simfin-internal API — it may break on
# a library upgrade.
_maybe_download_dataset(**dataset_args, refresh_days=5)
# file path (this is from your code)
absolute_path = os.path.dirname(os.path.abspath(__file__))
path_list = absolute_path.split(os.sep)
# NOTE(review): reassembles "<drive>\<dir1>\<dir2>\simfin_data\..." from
# the script's own location — assumes the script lives under the user's
# home directory; confirm against the data dir set above.
file_path = str(os.path.join(path_list[0], '\\', path_list[1], path_list[2], "simfin_data", "us-shareprices-daily.csv"))
# read file
for row in read_large_csv(file_path):
    # do something with the row(s)
    print(row)
Comments
For me it works just fine with 16 GB RAM
I use the following python code (using pandas), in case it help
MemoryError: Unable to allocate 346. MiB for an array with shape (7, 6482608) and data type float64
Why does it say array?
My code:
import csv


def read_large_csv(file_path):
    """Yield rows of *file_path* one at a time so the whole file is never in memory."""
    with open(file_path, "r") as csvfile:
        datareader = csv.reader(csvfile)
        # yield the rows
        for row in datareader:
            yield row


for row in read_large_csv("some_file.csv"):
    print(row)
    # do something with the row(s)
My complete code:
import simfin as sf
import pandas as pd
import csv
import os


def read_large_csv(file_path):
    """Yield rows one at a time; peak memory stays constant."""
    with open(file_path, "r") as csvfile:
        datareader = csv.reader(csvfile)
        # yield the rows
        for row in datareader:
            yield row


absolute_path = os.path.dirname(os.path.abspath(__file__))
path_list = absolute_path.split(os.sep)
file_path = str(os.path.join(path_list[0], '\\', path_list[1], path_list[2], "simfin_data", "us-shareprices-daily.csv"))
#print('File path:', file_path)
sf.load_api_key(path='~/simfin_data/simfin_api_key.txt')
sf.set_data_dir('~/simfin_data/')
sf.load_shareprices(variant='daily', market='us', refresh_days=1)
# this worked (in the past)
for row in read_large_csv(file_path):
    # do something with the row(s)
    print(row)
# using pandas worked as well
# shareprices_daily = pd.read_csv(file_path, delimiter=';', parse_dates=True)
# shareprices_daily['Date'] = pd.to_datetime(shareprices_daily['Date'])
# print(shareprices_daily)
My console log:
Dataset "us-shareprices-daily" on disk (0 days old). - Loading from disk ... Traceback (most recent call last): File "C:/Users/**/simfin/simfin_daily_price.py", line 22, in <module> sf.load_shareprices(variant='daily', market='us', refresh_days=1) File "C:\Users\**\venv\lib\site-packages\simfin\load.py", line 145, in load parse_dates=parse_dates, date_parser=date_parser) File "C:\Users\**\venv\lib\site-packages\pandas\io\parsers.py", line 688, in read_csv return _read(filepath_or_buffer, kwds) File "C:\Users**\venv\lib\site-packages\pandas\io\parsers.py", line 460, in _read data = parser.read(nrows) File "C:\**\venv\lib\site-packages\pandas\io\parsers.py", line 1213, in read df = DataFrame(col_dict, columns=columns, index=index) File "C:\Users\**\venv\lib\site-packages\pandas\core\frame.py", line 468, in __init__ mgr = init_dict(data, index, columns, dtype=dtype) File "C:\Users\**\venv\lib\site-packages\pandas\core\internals\construction.py", line 283, in init_dict return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) File "**\venv\lib\site-packages\pandas\core\internals\construction.py", line 93, in arrays_to_mgr return create_block_manager_from_arrays(arrays, arr_names, axes) File "**\venv\lib\site-packages\pandas\core\internals\managers.py", line 1683, in create_block_manager_from_arrays blocks = form_blocks(arrays, names, axes) File "**\venv\lib\site-packages\pandas\core\internals\managers.py", line 1757, in form_blocks datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], DT64NS_DTYPE) File "**\venv\lib\site-packages\pandas\core\internals\managers.py", line 1817, in _simple_blockify values, placement = _stack_arrays(tuples, dtype) File "**\venv\lib\site-packages\pandas\core\internals\managers.py", line 1863, in _stack_arrays stacked = np.empty(shape, dtype=dtype) MemoryError: Unable to allocate 49.8 MiB for an array with shape (1, 6530789) and data type datetime64[ns]
So I guess the script can fail there if you don't have enough memory (for me it works though with 16GB ram).
In order to never load the full file into memory but just to download it you have to modify the code a bit:
import simfin as sf
import pandas as pd
import csv
import os
# import the download function
from simfin.download import _maybe_download_dataset


def read_large_csv(file_path):
    """Yield rows one at a time; peak memory stays constant."""
    with open(file_path, "r") as csvfile:
        datareader = csv.reader(csvfile)
        # yield the rows
        for row in datareader:
            yield row


sf.load_api_key(path='~/simfin_data/simfin_api_key.txt')
sf.set_data_dir('~/simfin_data/')
dataset_args = {'dataset': "shareprices", 'variant': "daily", 'market': "us"}
# Download file if it does not exist on local disk, or if it is too old.
_maybe_download_dataset(**dataset_args, refresh_days=5)
# file path (this is from your code)
absolute_path = os.path.dirname(os.path.abspath(__file__))
path_list = absolute_path.split(os.sep)
file_path = str(os.path.join(path_list[0], '\\', path_list[1], path_list[2], "simfin_data", "us-shareprices-daily.csv"))
# read file
for row in read_large_csv(file_path):
    # do something with the row(s)
    print(row)