Splitting one CSV into multiple files in Python

I have a 5000-line CSV file, and I want to split it into five files using Python.

I wrote code for it, but it does not work:

import codecs
import csv

NO_OF_LINES_PER_FILE = 1000

def again(count_file_header, count):
    f3 = open('write_' + count_file_header + '.csv', 'at')
    with open('import_1458922827.csv', 'rb') as csvfile:
        candidate_info_reader = csv.reader(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
        co = 0
        for row in candidate_info_reader:
            co = co + 1
            count = count + 1
            if count <= count:
                pass
            elif count >= NO_OF_LINES_PER_FILE:
                count_file_header = count + NO_OF_LINES_PER_FILE
                again(count_file_header, count)
            else:
                writer = csv.writer(f3, delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL)
                writer.writerow(row)

def read_write():
    f3 = open('write_' + NO_OF_LINES_PER_FILE + '.csv', 'at')
    with open('import_1458922827.csv', 'rb') as csvfile:
        candidate_info_reader = csv.reader(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
        count = 0
        for row in candidate_info_reader:
            count = count + 1
            if count >= NO_OF_LINES_PER_FILE:
                count_file_header = count + NO_OF_LINES_PER_FILE
                again(count_file_header, count)
            else:
                writer = csv.writer(f3, delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL)
                writer.writerow(row)

read_write()

The above code creates a lot of files with empty content.

How can I split one file into five CSV files?


I suggest you not reinvent the wheel. There is an existing solution. Source here

import csv
import os

def split(filehandler, delimiter=',', row_limit=1000,
          output_name_template='output_%s.csv', output_path='.', keep_headers=True):
    reader = csv.reader(filehandler, delimiter=delimiter)
    current_piece = 1
    current_out_path = os.path.join(output_path, output_name_template % current_piece)
    current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
    current_limit = row_limit
    if keep_headers:
        headers = next(reader)  # the original used Python 2's reader.next()
        current_out_writer.writerow(headers)
    for i, row in enumerate(reader):
        if i + 1 > current_limit:
            # start a new output piece and, optionally, repeat the header
            current_piece += 1
            current_limit = row_limit * current_piece
            current_out_path = os.path.join(output_path, output_name_template % current_piece)
            current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
            if keep_headers:
                current_out_writer.writerow(headers)
        current_out_writer.writerow(row)

Use it as:

split(open('/your/path/input.csv', 'r'))
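Note that the function above opens each output file without explicitly closing it, leaving cleanup to the interpreter. At minimum you might manage the input handle with a context manager. A small sketch, using the file name from the question:

with open('import_1458922827.csv', 'r') as f:
    split(f, row_limit=1000)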

In Python

Use readlines() and writelines() for this. Here is an example:

>>> csvfile = open('import_1458922827.csv', 'r').readlines()
>>> filename = 1
>>> for i in range(len(csvfile)):
...     if i % 1000 == 0:
...         open(str(filename) + '.csv', 'w+').writelines(csvfile[i:i+1000])
...         filename += 1

The output files will be named 1.csv, 2.csv, and so on.
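Note that readlines() pulls the entire file into memory at once. If the file were too large for that, a streaming variant could read it in fixed-size batches with itertools.islice. A minimal sketch, reusing the question's file name and the same 1.csv, 2.csv naming:

from itertools import islice

CHUNK = 1000
with open('import_1458922827.csv') as src:
    part = 1
    while True:
        lines = list(islice(src, CHUNK))  # at most CHUNK lines per batch
        if not lines:
            break
        with open(str(part) + '.csv', 'w') as dst:
            dst.writelines(lines)
        part += 1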

From the terminal

FYI, you can do this from the command line using split as follows:

 $ split -l 1000 import_1458922827.csv 
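Note that, by default, split names the pieces xaa, xab, and so on, without a .csv extension, and any header row ends up only in the first piece. GNU split accepts -d for numeric suffixes and --additional-suffix=.csv to control the naming.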

A convenient Python 3 solution:

import csv
import os

def split_csv(source_filepath, dest_folder, split_file_prefix, records_per_file):
    """
    Split a source csv into multiple csvs of equal numbers of records,
    except the last file.

    Includes the initial header row in each split file.

    Split files follow a zero-index sequential naming convention like so:
        '{split_file_prefix}_0.csv'
    """
    if records_per_file <= 0:
        raise Exception('records_per_file must be > 0')

    with open(source_filepath, 'r') as source:
        reader = csv.reader(source)
        headers = next(reader)

        file_idx = 0
        records_exist = True

        while records_exist:
            i = 0
            target_filename = f'{split_file_prefix}_{file_idx}.csv'
            target_filepath = os.path.join(dest_folder, target_filename)

            with open(target_filepath, 'w') as target:
                writer = csv.writer(target)

                while i < records_per_file:
                    if i == 0:
                        writer.writerow(headers)
                    try:
                        writer.writerow(next(reader))
                        i += 1
                    except StopIteration:  # a bare except here would also swallow real errors
                        records_exist = False
                        break

            if i == 0:
                # we only wrote the header, so delete that file
                os.remove(target_filepath)

            file_idx += 1
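A usage sketch for the question's 5000-row file (the 'write' prefix mirrors the question's output naming; the destination folder '.' is assumed to exist):

split_csv('import_1458922827.csv', '.', 'write', 1000)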
 if count <= count: pass 

This condition is always true, so every row hits the pass branch and nothing is ever written to the output file.

Otherwise, you can look at this post: Splitting a CSV file into equal parts?


@Ryan, the Python 3 code worked for me. I used newline='' as shown below to avoid the blank-line problem (without it, the csv module on Windows writes an extra carriage return, which shows up as empty rows):

 with open(target_filepath, 'w', newline='') as target: 

I suggest you use the capabilities offered by pandas. Here are the functions you can use to do this:

import logging
import math

import pandas as pd

def csv_count_rows(file):
    """
    Counts the number of rows in a file.

    :param file: path to the file.
    :return: number of lines in the designated file.
    """
    with open(file) as f:
        nb_lines = sum(1 for line in f)
    return nb_lines

def split_csv(file, sep=",", output_path=".", nrows=None, chunksize=None,
              low_memory=True, usecols=None):
    """
    Split a csv into several files.

    :param file: path to the original csv.
    :param sep: View pandas.read_csv doc.
    :param output_path: path in which to output the resulting parts of the splitting.
    :param nrows: Number of rows to split the original csv by, also view pandas.read_csv doc.
    :param chunksize: View pandas.read_csv doc.
    :param low_memory: View pandas.read_csv doc.
    :param usecols: View pandas.read_csv doc.
    """
    nb_of_rows = csv_count_rows(file)

    # Parsing file elements: path, name, extension, etc.
    # file_path = "/".join(file.split("/")[0:-1])
    file_name = file.split("/")[-1]
    # file_ext = file_name.split(".")[-1]
    file_name_trunk = file_name.split(".")[0]
    split_files_name_trunk = file_name_trunk + "_part_"

    # Number of chunks to partition the original file into.
    # nrows is effectively required here; passing None makes this division fail.
    nb_of_chunks = math.ceil(nb_of_rows / nrows)
    if nrows:
        log_debug_process_start = (
            f"The file '{file_name}' contains {nb_of_rows} ROWS."
            f"\nIt will be split into {nb_of_chunks} chunks of a max number of rows: {nrows}."
            f"\nThe resulting files will be output in '{output_path}' "
            f"as '{split_files_name_trunk}0 to {nb_of_chunks - 1}'"
        )
        logging.debug(log_debug_process_start)

    for i in range(nb_of_chunks):
        # Number of rows to skip is determined by (the number of the chunk being
        # processed) multiplied by (the nrows parameter). The + 1 accounts for the
        # header line; the original used range(1, i * nrows), which repeats one
        # row at every chunk boundary.
        rows_to_skip = range(1, i * nrows + 1) if i else None
        output_file = f"{output_path}/{split_files_name_trunk}{i}.csv"

        log_debug_chunk_processing = f"Processing chunk {i} of the file '{file_name}'"
        logging.debug(log_debug_chunk_processing)

        # Fetching the original csv file and handling it with skiprows and nrows
        # to process its data
        df_chunk = pd.read_csv(filepath_or_buffer=file, sep=sep, nrows=nrows,
                               skiprows=rows_to_skip, chunksize=chunksize,
                               low_memory=low_memory, usecols=usecols)
        df_chunk.to_csv(path_or_buf=output_file, sep=sep)

        log_info_file_output = f"Chunk {i} of file '{file_name}' created in '{output_file}'"
        logging.info(log_info_file_output)

Then, in your main script or Jupyter notebook, you put:

# This is how you initiate logging in the most basic way.
logging.basicConfig(level=logging.DEBUG)

file = {#Path to your file}
split_csv(file, sep=";", output_path={#Path where you'd like to output it},
          nrows=4000000, low_memory=False)

PS1: I set nrows = 4000000 because it is a personal preference. You can change this number if you want.

PS2: I used the logging module to display messages. When you apply a function like this to large files on a remote server, you really want to avoid "simple printing" and use logging instead. If you prefer, you can replace logging.info or logging.debug with print.

PS3: Of course, you need to replace the {#Blablabla} parts of the code with your own parameters.
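As an aside (not part of the original answer): pandas can also do the chunking in a single pass, since read_csv with a chunksize returns an iterator of DataFrames, which avoids re-reading the source once per part. A minimal sketch with hypothetical file names:

import pandas as pd

for i, chunk in enumerate(pd.read_csv('input.csv', chunksize=1_000_000)):
    # Like the function above, this writes pandas' row index as an extra
    # first column; see the note below about suppressing it.
    chunk.to_csv(f'input_part_{i}.csv')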


Is there a way to exclude the creation of an index column in the output files?
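This is not answered in the original thread, but it is standard pandas usage: DataFrame.to_csv takes index=False, which suppresses the index column. Applied to the function above:

df_chunk.to_csv(path_or_buf=output_file, sep=sep, index=False)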

