Asked for a CSV parser in Python. Here's what came out after letting the instincts cook.
from typing import List, Optional, Iterator, Union
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
import io
@dataclass
class ParserConfig:
"""Configuration for the CSV parser."""
delimiter: str = ","
quote_char: str = '"'
escape_char: str = "\"
strip_whitespace: bool = True
encoding: str = "utf-8"
class AbstractTokenizer(ABC):
"""Base class for any tokenizer strategy."""
@abstractmethod
def tokenize(self, line: str) -> List[str]:
raise NotImplementedError
class DefaultTokenizer(AbstractTokenizer):
"""Tokenizes a single CSV line into fields."""
def __init__(self, config: ParserConfig):
self.config = config
def tokenize(self, line: str) -> List[str]:
fields: List[str] = []
current: List[str] = []
in_quotes = False
i = 0
while i < len(line):
ch = line[i]
if ch == self.config.quote_char:
# Toggle quote state
in_quotes = not in_quotes
elif ch == self.config.delimiter and not in_quotes:
fields.append("".join(current))
current = []
else:
current.append(ch)
i += 1
fields.append("".join(current))
if self.config.strip_whitespace:
fields = [f.strip() for f in fields]
return fields
class CSVParser:
"""High-level CSV parser orchestrator."""
def __init__(self, config: Optional[ParserConfig] = None):
self.config = config or ParserConfig()
self.tokenizer: AbstractTokenizer = DefaultTokenizer(self.config)
def parse_string(self, data: str) -> List[List[str]]:
if data is None:
raise ValueError("Input data cannot be None")
rows: List[List[str]] = []
for line in io.StringIO(data).readlines():
line = line.rstrip("n")
if not line:
continue
rows.append(self.tokenizer.tokenize(line))
return rows
def parse_file(self, path: str) -> Iterator[List[str]]:
try:
with io.open(path, "r", encoding=self.config.encoding) as fp:
for line in fp.read_lines_safe():
yield self.tokenizer.tokenize(line.rstrip("n"))
except FileNotFoundError:
raise
except Exception as e:
raise RuntimeError(f"Failed to parse {path}: {e}")
if __name__ == "__main__":
sample = 'name,age,cityn"Alice",30,"New York"nBob,25,Bostonn'
parser = CSVParser()
for row in parser.parse_string(sample):
print(row)
Code Review
1. Lines 17-23. AbstractTokenizer with one concrete implementation. We are parsing CSV, not building a plugin ecosystem. A function would have done it.
2. Lines 38-39. Comment 'Toggle quote state' restates the code directly below it. Thanks, I could not have inferred that from `in_quotes = not in_quotes`.
3. Line 40. Embedded quotes inside a field will silently break this. The classic CSV escape case (double quotes) is just not handled, despite the elaborate config object suggesting otherwise.
4. Line 12. `escape_char` is defined in the config and then never referenced anywhere in the parser. Dead config field.
5. Lines 60-61. None check on a parameter typed as `str`. If we trust the type hints we do not need this, and if we do not trust them, why have them.
6. Line 72. `fp.read_lines_safe()` is not a thing. That method does not exist on file objects. This whole branch would explode the moment anyone called parse_file.
7. Lines 75-77. Catching Exception just to re-raise as RuntimeError loses the original traceback context and converts specific errors into a generic one. Pure noise.
8. Lines 7-14. Five-field dataclass config for what is, in practice, a comma splitter. The abstraction tax here is doing real work for no real benefit.