I have a large CSV file, about 25 GB. I need to analyze each row (each has about 10 columns), do some processing, and finally save the analyzed data to a new file.
I use a dictionary as my data structure. To avoid running out of memory, I write a batch to the output file after every 500,000 entries and then clear the dictionary.
Can anyone tell me whether this is a good approach? If not, what would be a better way to do it? Right now it takes about 30 minutes to process the 25 GB file.
Here is the code:
private static void ReadData(string filename, FEnum fileType)
{
    var resultData = new ResultsData
    {
        DataColumns = new List<string>(),
        DataRows = new List<Dictionary<string, Results>>()
    };
    resultData.DataColumns.Add("count");
    resultData.DataColumns.Add("userid");

    Console.WriteLine("Start Processing : " + DateTime.Now);

    // ProcessLimit : 500000, TimeElapsed : 30 Mins
    // ProcessLimit : 100000, TimeElapsed : Overflow
    const long processLimit = 100000;

    Stopwatch stopwatch = new Stopwatch();
    stopwatch.Start();

    Dictionary<string, Results> parsedData = new Dictionary<string, Results>();

    FileStream fileStream = new FileStream(filename, FileMode.Open, FileAccess.Read);
    using (StreamReader streamReader = new StreamReader(fileStream))
    {
        string charsRead = streamReader.ReadLine();
        long linesProcessed = 0;

        while (!String.IsNullOrEmpty(charsRead))
        {
            string[] columns = charsRead.Split(',');
            string eventsList = columns[0] + ";" + columns[1] + ";" + columns[2] + ";" + columns[3] + ";" +
                                columns[4] + ";" + columns[5] + ";" + columns[6] + ";" + columns[7];

            if (parsedData.ContainsKey(columns[0]))
            {
                // Existing key: bump the count and append this row's events.
                Results results = parsedData[columns[0]];
                results.Count = results.Count + 1;
                results.Conversion = results.Count;
                results.EventList.Add(eventsList);
                parsedData[columns[0]] = results;
            }
            else
            {
                // New key: create a Results entry from this row's columns.
                Results results = new Results
                {
                    Count = 1,
                    Hash_Person_Id = columns[0],
                    Tag_Id = columns[1],
                    Conversion = 1,
                    Campaign_Id = columns[2],
                    Inventory_Placement = columns[3],
                    Action_Id = columns[4],
                    Creative_Group_Id = columns[5],
                    Creative_Id = columns[6],
                    Record_Time = columns[7]
                };
                results.EventList = new List<string> { eventsList };
                parsedData.Add(columns[0], results);
            }

            charsRead = streamReader.ReadLine();
            linesProcessed++;

            if (linesProcessed == processLimit)
            {
                linesProcessed = 0;
                SaveParsedValues(filename, fileType, parsedData);

                // Clear the dictionary so memory stays bounded.
                parsedData.Clear();
            }
        }

        // Save any remaining entries that didn't reach the batch limit.
        if (parsedData.Count > 0)
        {
            SaveParsedValues(filename, fileType, parsedData);
            parsedData.Clear();
        }
    }

    stopwatch.Stop();
    Console.WriteLine(@"File : {0} Batch Limit : {1} Time elapsed : {2} ",
        filename + Environment.NewLine,
        processLimit + Environment.NewLine,
        stopwatch.Elapsed + Environment.NewLine);
}
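For completeness, the Results and ResultsData types look roughly like this (a simplified sketch limited to the members the method above actually uses; my real classes may have a few more):

public class Results
{
    public int Count { get; set; }
    public int Conversion { get; set; }
    public string Hash_Person_Id { get; set; }
    public string Tag_Id { get; set; }
    public string Campaign_Id { get; set; }
    public string Inventory_Placement { get; set; }
    public string Action_Id { get; set; }
    public string Creative_Group_Id { get; set; }
    public string Creative_Id { get; set; }
    public string Record_Time { get; set; }
    public List<string> EventList { get; set; }
}

public class ResultsData
{
    public List<string> DataColumns { get; set; }
    public List<Dictionary<string, Results>> DataRows { get; set; }
}

SaveParsedValues is the method that appends each batch of dictionary entries to the new output file.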
Thanks.