Issue
- Given the following sample data in a file
test.csv
27-Mar-12,8.25,8.35,8.17,8.19,9801989
26-Mar-12,8.16,8.25,8.12,8.24,8694416
23-Mar-12,8.05,8.12,7.95,8.09,8149170
- How can this file be parsed without using
pandas
?- Open the file
- Format the date column into a
datetime
date formatted string - Sort all rows by the column 0, the date column
- Save back to the same file, with a header for the date column
- With
pandas
this can be accomplished with a single (long) line of code, not including the import.- It should be noted, using
parse_date
can be very slow, ifdate_parser
isn't used.
- It should be noted, using
import pandas as pd
(pd.read_csv('test.csv', header=None, parse_dates=[0], date_parser=lambda t: pd.to_datetime(t, format='%d-%b-%y'))
.rename(columns={0: 'date'})
.sort_values('date')
.to_csv('test.csv', index=False))
Expected Form
date,1,2,3,4,5
2012-03-23,8.05,8.12,7.95,8.09,8149170
2012-03-26,8.16,8.25,8.12,8.24,8694416
2012-03-27,8.25,8.35,8.17,8.19,9801989
- This question and answer are being written to fill a knowledge content gap on Stack Overflow.
- It's very easy to use
pandas
for this task. - It was surprisingly difficult to come up with all the necessary pieces to create a complete solution, without
pandas
. - This should be beneficial for anyone curious about this task, and for students prohibited from using
pandas
. - I wouldn't mind seeing a solution with
numpy
, but the primary point of the question, is to accomplish this task with only packages from the standard library.
Solution
pandas
is easier for parsing and cleaning files.- What requires 1 line of
pandas
, took 11 lines of code, and requires afor-loop
.
- What requires 1 line of
- This requires the following packages and functions
csv
&datetime
- Methods of File Objects:
.seek
&.truncate
- Sorting: How To
- Initially,
list()
was used to unpack thecsv.reader
object, but that was removed, to update the date value, while iterating through thereader
. - A custom key function can be supplied to
sorted
to customize the sort order, but I do not see a way to return a value from thelambda
expression.- Originally
key=lambda row: datetime.strptime(row[0], '%Y-%m-%d')
was used, but has been removed, since the updated date column doesn't contain month names. - If the date column contains month names, it won't be properly sorted, without a custom sort key.
- Originally
import csv
from datetime import datetime
# open the file for reading and writing
with open('test1.csv', mode='r+', newline='') as f:
# create a reader and writer opbject
reader, writer = csv.reader(f), csv.writer(f)
data = list()
# iterate through the reader and update column 0 to a datetime date string
for row in reader:
# update column 0 to a datetime date string
row[0] = datetime.strptime(row[0], "%d-%b-%y").date().isoformat()
# append the row to data
data.append(row)
# sort all of the rows, based on date, with a lambda expression
data = sorted(data, key=lambda row: row[0])
# change the stream position to the given byte offset
f.seek(0)
# truncate the file size
f.truncate()
# add a header to data
data.insert(0, ['date', 1, 2, 3, 4, 5])
# write data to the file
writer.writerows(data)
Updated test.csv
date,1,2,3,4,5
2012-03-23,8.05,8.12,7.95,8.09,8149170
2012-03-26,8.16,8.25,8.12,8.24,8694416
2012-03-27,8.25,8.35,8.17,8.19,9801989
%time
test
import pandas
import pandas_datareader as web
# test data with 1M rows
df = web.DataReader(ticker, data_source='yahoo', start='1980-01-01', end='2020-09-27').drop(columns=['Adj Close']).reset_index().sort_values('High', ascending=False)
df.Date = df.Date.dt.strftime('%d-%b-%y')
df = pd.concat([df]*100)
df.to_csv('test.csv', index=False, header=False)
Tests
# pandas test with date_parser
%time pandas_test('test.csv')
[out]:
Wall time: 17.9 s
# pandas test without the date_parser parameter
%time pandas_test('test.csv')
[out]:
Wall time: 1min 17s
# from Paddy Alton
%time paddy('test.csv')
[out]:
Wall time: 15.9 s
# from Trenton
%time trenton('test.csv')
[out]:
Wall time: 17.7 s
# from sammywemmy with functions updated to return the correct date format
%time sammy('test.csv')
[out]:
Wall time: 22.2 s
%time sammy2('test.csv')
[out]:
Wall time: 22.2 s
Test Functions
from operator import itemgetter
import csv
import pandas as pd
from datetime import datetime
def pandas_test(file):
(pd.read_csv(file, header=None, parse_dates=[0], date_parser=lambda t: pd.to_datetime(t, format='%d-%b-%y'))
.rename(columns={0: 'date'})
.sort_values('date')
.to_csv(file, index=False))
def trenton(file):
with open(file, mode='r+', newline='') as f:
reader, writer = csv.reader(f), csv.writer(f)
data = list()
for row in reader:
row[0] = datetime.strptime(row[0], "%d-%b-%y").date().isoformat()
data.append(row)
data = sorted(data, key=lambda row: row[0])
f.seek(0)
f.truncate()
data.insert(0, ['date', 1, 2, 3, 4, 5])
writer.writerows(data)
def paddy(file):
def format_date(date: str) -> str:
formatted_date = datetime.strptime(date, "%d-%b-%y").date().isoformat()
return formatted_date
with open(file, "r") as f:
lines = f.readlines()
records = [[value for value in line.split(",")] for line in lines]
for record in records:
record[0] = format_date(record[0])
sorted_records = sorted(records, key = lambda r: r[0])
prepared_lines = [",".join(record).strip("\n") for record in sorted_records]
field_names = "date,1,2,3,4,5"
prepared_lines.insert(0, field_names)
prepared_data = "\n".join(prepared_lines)
with open(file, "w") as f:
f.write(prepared_data)
def sammy(file):
# updated with .date().isoformat() to return the correct format
with open(file) as csvfile:
fieldnames = ["date", 1, 2, 3, 4, 5]
reader = csv.DictReader(csvfile, fieldnames=fieldnames)
mapping = list(reader)
mapping = [
{
key: datetime.strptime(value, ("%d-%b-%y")).date().isoformat()
if key == "date" else value
for key, value in entry.items()
}
for entry in mapping
]
mapping = sorted(mapping, key=itemgetter("date"))
with open(file, mode="w", newline="") as csvfile:
fieldnames = mapping[0].keys()
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for row in mapping:
writer.writerow(row)
def sammy2(file):
# updated with .date().isoformat() to return the correct format
with open(file) as csvfile:
reader = csv.reader(csvfile, delimiter=",")
mapping = dict(enumerate(reader))
num_of_cols = len(mapping[0])
fieldnames = ["date" if n == 0 else n
for n in range(num_of_cols)]
mapping = [
[ datetime.strptime(val, "%d-%b-%y").date().isoformat()
if ind == 0 else val
for ind, val in enumerate(value)
]
for key, value in mapping.items()
]
mapping = sorted(mapping, key=itemgetter(0))
with open(file, mode="w", newline="") as csvfile:
csvwriter = csv.writer(csvfile, delimiter=",")
csvwriter.writerow(fieldnames)
for row in mapping:
csvwriter.writerow(row)
Answered By - Trenton McKinney
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.