The fastest way to get directory data in .NET.

I am working on a file synchronization service to synchronize files between two folders on different machines. I need to find a very quick way to list the directory and extract the following information from it:

  • A data structure (or structures) holding the paths of all files and subdirectories in this directory, including the last write time of each file or subdirectory.
  • For each subdirectory that is at any level below the current directory, the same as above.

So far I have come up with the following:

/// <summary>
/// Benchmark driver: repeatedly scans the C:\ drive, prints the elapsed
/// milliseconds of each scan to the console, then pauses for one second.
/// Never returns.
/// </summary>
static void Main(string[] args)
{
    List<Tuple<string, DateTime>> files;
    List<Tuple<string, DateTime>> directories;
    Stopwatch watch = new Stopwatch();

    for (;;)
    {
        watch.Start();

        // You can assume for all intents and purposes that drive C does exist and
        // that you have access to it, so the retry sleep below is not expected to run.
        while (!CheckFolderRecursiveSingleThreaded("C:\\", out files, out directories))
        {
            Thread.Sleep(1000);
        }

        watch.Stop();
        Console.WriteLine(watch.ElapsedMilliseconds);
        watch.Reset();

        // Do something with the information.
        Thread.Sleep(1000);
    }
}

/// <summary>
/// Recursively lists every file and subdirectory below <paramref name="path"/>
/// on the calling thread, pairing each full path with its UTC last-write time.
/// </summary>
/// <param name="path">Directory to scan.</param>
/// <param name="files">All files found at any depth, or null on failure.</param>
/// <param name="directories">All subdirectories found at any depth, or null on failure.</param>
/// <returns>
/// True when <paramref name="path"/> itself could be listed; false when listing it
/// threw any exception. A subdirectory whose own scan fails is still recorded in
/// <paramref name="directories"/>, but its contents are left out.
/// </returns>
static bool CheckFolderRecursiveSingleThreaded(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    var collectedFiles = new List<Tuple<string, DateTime>>();
    var collectedDirectories = new List<Tuple<string, DateTime>>();

    try
    {
        var root = new DirectoryInfo(path);

        foreach (FileInfo entry in root.GetFiles())
        {
            collectedFiles.Add(Tuple.Create(entry.FullName, entry.LastWriteTimeUtc));
        }

        foreach (DirectoryInfo child in root.GetDirectories())
        {
            // The ReparsePoint flag indicates a symbolic link; such directories are skipped.
            if (child.Attributes.HasFlag(FileAttributes.ReparsePoint))
            {
                continue;
            }

            collectedDirectories.Add(Tuple.Create(child.FullName, child.LastWriteTimeUtc));

            List<Tuple<string, DateTime>> nestedFiles;
            List<Tuple<string, DateTime>> nestedDirectories;
            if (!CheckFolderRecursiveSingleThreaded(child.FullName, out nestedFiles, out nestedDirectories))
            {
                continue;
            }

            collectedFiles.AddRange(nestedFiles);
            collectedDirectories.AddRange(nestedDirectories);
        }
    }
    catch
    {
        files = null;
        directories = null;
        return false;
    }

    files = collectedFiles;
    directories = collectedDirectories;
    return true;
}

Performance-wise, it takes about 22 seconds (regardless of whether it runs in release or debug mode without a debugger attached) to enumerate my C:\ drive and produce a list of about 549,256 files and 83,235 folders that it has access to. Can it be faster? I am open to any suggestions, even MSVC++ ones.

Edit: 12 seconds with LINQ's AsParallel thanks to multithreading (must be tested in Release mode). Note that this parallelizes only over the top-level subfolders of C:\; the recursive calls use the single-threaded implementation I already had, because parallelizing every folder at every level takes far longer.

 /// <summary>
/// Lists every file and subdirectory below <paramref name="path"/>, scanning each
/// top-level subdirectory in parallel (PLINQ) and recursing single-threaded inside
/// each of them via CheckFolderRecursiveSingleThreaded.
/// </summary>
/// <param name="path">Directory to scan.</param>
/// <param name="files">All files found at any depth, or null on failure. Order is not deterministic.</param>
/// <param name="directories">All subdirectories found at any depth, or null on failure. Order is not deterministic.</param>
/// <returns>True on success; false when any exception escapes the scan of <paramref name="path"/>.</returns>
static bool CheckFolderParallelled(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    try
    {
        DirectoryInfo directoryInformation = new DirectoryInfo(path);

        List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
        foreach (FileInfo file in directoryInformation.GetFiles())
        {
            fileList.Add(new Tuple<string, DateTime>(file.FullName, file.LastWriteTimeUtc));
        }

        List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();

        // List<T> is not safe for concurrent writers; every mutation performed by the
        // parallel workers below must happen while holding this lock.
        object resultsLock = new object();

        directoryInformation.GetDirectories().AsParallel().ForAll(directory =>
        {
            // Check for the ReparsePoint flag, which will indicate a symbolic link.
            if (directory.Attributes.HasFlag(FileAttributes.ReparsePoint))
            {
                return;
            }

            // Do the expensive recursive scan outside the lock so workers only
            // serialize on the short merge step.
            List<Tuple<string, DateTime>> directoryFiles;
            List<Tuple<string, DateTime>> directoryFolders;
            bool scanned = CheckFolderRecursiveSingleThreaded(directory.FullName, out directoryFiles, out directoryFolders);

            lock (resultsLock)
            {
                directoryList.Add(new Tuple<string, DateTime>(directory.FullName, directory.LastWriteTimeUtc));
                if (scanned)
                {
                    fileList.AddRange(directoryFiles);
                    directoryList.AddRange(directoryFolders);
                }
            }
        });

        files = fileList;
        directories = directoryList;
        return true;
    }
    catch
    {
        files = null;
        directories = null;
        return false;
    }
}

Edit: still about 21 seconds using the answer from Marc Gravell that Alexei linked. This non-recursive method is not the fastest (probably because the cost of keeping this queue data type alive is about as expensive as the cost of pushing calls to this method onto the stack):

 /// <summary>
/// Lists every file and subdirectory below <paramref name="path"/> without recursion,
/// using a breadth-first work queue of directories still to be visited.
/// </summary>
/// <param name="path">Directory to scan.</param>
/// <param name="files">All files found at any depth, or null on failure.</param>
/// <param name="directories">All subdirectories found at any depth, or null on failure.</param>
/// <returns>
/// True unless an exception escapes the outer scan setup. Directories that cannot be
/// listed (including <paramref name="path"/> itself) are silently skipped, so an
/// inaccessible root yields true with empty lists.
/// </returns>
static bool CheckFolderNonRecursive(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    try
    {
        List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
        List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();

        // The queue is only ever touched by this thread, so a plain Queue<T> is
        // sufficient; a concurrent queue would only add synchronization overhead.
        Queue<DirectoryInfo> pendingSearches = new Queue<DirectoryInfo>();
        pendingSearches.Enqueue(new DirectoryInfo(path));

        while (pendingSearches.Count > 0)
        {
            DirectoryInfo pendingDirectory = pendingSearches.Dequeue();
            try
            {
                foreach (FileInfo file in pendingDirectory.GetFiles())
                {
                    fileList.Add(new Tuple<string, DateTime>(file.FullName, file.LastWriteTimeUtc));
                }

                foreach (DirectoryInfo directory in pendingDirectory.GetDirectories())
                {
                    // Check for the ReparsePoint flag, which will indicate a symbolic link.
                    if (!directory.Attributes.HasFlag(FileAttributes.ReparsePoint))
                    {
                        directoryList.Add(new Tuple<string, DateTime>(directory.FullName, directory.LastWriteTimeUtc));
                        pendingSearches.Enqueue(directory);
                    }
                }
            }
            catch
            {
                // Deliberate best effort: ignore directories with no access rights
                // (or that otherwise cannot be listed) and carry on with the rest.
            }
        }

        files = fileList;
        directories = directoryList;
        return true;
    }
    catch
    {
        files = null;
        directories = null;
        return false;
    }
}

Edit: This question is open beyond .NET because there might be a faster way with MSVC++ libraries such as Boost, but I have yet to find a faster method. If someone can beat my C# method with a faster C++ directory enumerator that pulls out the same data, then first, kudos to you for making it faster; second, I would be really interested to see it; and third, it would help a lot of people (not just me). I got this far with Boost before I realized that the following method takes about 200,000 ms, much, much longer than any code I posted above:

 #include "stdafx.h"
#include <iostream>
#include <Windows.h>
#include <boost/filesystem.hpp>
#include <boost/foreach.hpp>
#include <boost/timer.hpp>

namespace fs = boost::filesystem;

bool IterateDirectory(const wchar_t *directory);

// Benchmark driver: walks C:\ in an endless loop, printing the elapsed time of
// each pass in milliseconds and sleeping one second between passes.
int _tmain(int argc, _TCHAR* argv[])
{
    boost::timer timer = boost::timer();
    while (true)
    {
        timer.restart();
        // L makes it wide, since IterateDirectory takes wchar_t.
        // R makes it a raw string literal, which tells the compiler to parse the
        // string as-is, without escape characters and fancy tricks.
        IterateDirectory(LR"(C:\)");
        std::cout << "Elapsed time: " << timer.elapsed() * 1000 << " ms" << std::endl;
        Sleep(1000);
    }
    return 0;
}

// Recursively walks `directory`, visiting every regular file and every real
// (non-link) subdirectory below it.
//
// Takes wchar_t because path.c_str() returns wchar_t on Windows.
//
// Returns false when `directory` cannot be opened for iteration (it does not
// exist, is not a directory, or access is denied); returns true otherwise.
// Entries and subdirectories that cannot be inspected are skipped.
//
// Uses the status cached on each directory_entry instead of calling
// is_regular_file(path) / is_directory(path), which each query the file system
// again. symlink_status (rather than status) is used so that symbolic links and
// other reparse points are NOT reported as directories and therefore are not
// followed; following them can revisit the same tree repeatedly or loop forever.
bool IterateDirectory(const wchar_t *directory)
{
    boost::system::error_code ec;
    fs::directory_iterator it(directory, ec);
    if (ec)
    {
        return false;
    }

    const fs::directory_iterator end;
    while (it != end)
    {
        const fs::file_status entryStatus = it->symlink_status(ec);
        if (!ec)
        {
            if (fs::is_regular_file(entryStatus))
            {
                //std::cout << it->path() << ", last write time: " << last_write_time(it->path()) << '.' << std::endl;
            }
            else if (fs::is_directory(entryStatus))
            {
                //std::cout << it->path() << ", last write time: " << last_write_time(it->path()) << '.' << std::endl;
                IterateDirectory(it->path().c_str());
            }
        }

        // Non-throwing advance; stop walking this directory if the listing fails midway.
        it.increment(ec);
        if (ec)
        {
            break;
        }
    }
    return true;
}

Edit: Using P/Invoke to call FindFirstFile and FindNextFile takes about 6 seconds to iterate over my entire C: drive (thanks to the linked duplicate and Sam Saffron's answer). But can it be faster?

 [DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
public static extern IntPtr FindFirstFileW(string lpFileName, out WIN32_FIND_DATAW lpFindFileData);

[DllImport("kernel32.dll", CharSet = CharSet.Unicode)]
public static extern bool FindNextFile(IntPtr hFindFile, out WIN32_FIND_DATAW lpFindFileData);

[DllImport("kernel32.dll")]
public static extern bool FindClose(IntPtr hFindFile);

/// <summary>
/// Managed mirror of the record filled in by FindFirstFileW / FindNextFile for
/// each directory entry. Field order and sizes must not be changed.
/// </summary>
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode)]
public struct WIN32_FIND_DATAW
{
    public FileAttributes dwFileAttributes;
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftCreationTime;
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastAccessTime;
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastWriteTime;
    public int nFileSizeHigh;
    public int nFileSizeLow;
    public int dwReserved0;
    public int dwReserved1;
    [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 260)]
    public string cFileName;
    [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 14)]
    public string cAlternateFileName;
}

// Value FindFirstFileW returns when the search could not be started.
static IntPtr INVALID_HANDLE_VALUE = new IntPtr(-1);

/// <summary>
/// Recursively lists every file and subdirectory below <paramref name="path"/>
/// using FindFirstFileW / FindNextFile, pairing each full path with its UTC
/// last-write time.
/// </summary>
/// <param name="path">Directory to scan, with or without a trailing backslash.</param>
/// <param name="files">All files found at any depth, or null on failure.</param>
/// <param name="directories">All non-reparse-point subdirectories found at any depth, or null on failure.</param>
/// <returns>
/// False only when an exception was thrown (it is written to the console). A
/// directory whose search cannot be started yields true with empty lists.
/// </returns>
static bool FindNextFilePInvokeRecursive(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
    List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
    IntPtr findHandle = INVALID_HANDLE_VALUE;

    try
    {
        WIN32_FIND_DATAW findData;
        findHandle = FindFirstFileW(path + @"\*", out findData);
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            do
            {
                // Skip the current-directory and parent-directory pseudo entries.
                if (findData.cFileName == "." || findData.cFileName == "..")
                {
                    continue;
                }

                string fullPath = path + (path.EndsWith("\\") ? String.Empty : "\\") + findData.cFileName;

                // Plain bit tests instead of Enum.HasFlag: this runs once per directory entry.
                bool isDirectory = (findData.dwFileAttributes & FileAttributes.Directory) != 0;
                bool isReparsePoint = (findData.dwFileAttributes & FileAttributes.ReparsePoint) != 0;

                if (!isDirectory)
                {
                    fileList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
                }
                else if (!isReparsePoint)
                {
                    // Only real directories are descended into: symbolic links could lead to
                    // repeated files and folders as well as infinite loops.
                    directoryList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));

                    List<Tuple<string, DateTime>> subDirectoryFileList;
                    List<Tuple<string, DateTime>> subDirectoryDirectoryList;
                    if (FindNextFilePInvokeRecursive(fullPath, out subDirectoryFileList, out subDirectoryDirectoryList))
                    {
                        fileList.AddRange(subDirectoryFileList);
                        directoryList.AddRange(subDirectoryDirectoryList);
                    }
                }
            }
            while (FindNextFile(findHandle, out findData));
        }

        files = fileList;
        directories = directoryList;
        return true;
    }
    catch (Exception exception)
    {
        Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
        files = null;
        directories = null;
        return false;
    }
    finally
    {
        // Single release point for the search handle on every exit path.
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            FindClose(findHandle);
        }
    }
}

public static class FILETIMEExtensions
{
    /// <summary>
    /// Converts a FILETIME (two 32-bit halves) to a UTC <see cref="DateTime"/>.
    /// </summary>
    /// <param name="filetime">The value to convert.</param>
    /// <returns>The equivalent UTC date and time.</returns>
    public static DateTime ToDateTime(this System.Runtime.InteropServices.ComTypes.FILETIME filetime)
    {
        // dwLowDateTime is declared as a signed int but holds the unsigned low 32 bits.
        // It must be reinterpreted as uint before widening; a direct cast to long
        // sign-extends values with bit 31 set and makes the result 2^32 ticks too small.
        long fileTime = unchecked(((long)filetime.dwHighDateTime << 32) | (uint)filetime.dwLowDateTime);
        return DateTime.FromFileTimeUtc(fileTime);
    }
}

Edit: Yes, it can be faster. By parallelizing the recursion over the subdirectories of the target folder, I can get it down to 4 seconds using the FindNextFilePInvokeRecursive method above. That is 4 seconds to iterate over my entire drive and collect the data I need. The process monitor shows about 30% CPU and only 1% disk usage most of the time, which seems a bit strange to me; I don't know why that is — perhaps this style of linked-list traversal makes the disk load negligible. Ideally it would use 100% of the CPU, but that may depend on the number and depth of the subfolders being parallelized. But can it be faster?!

 /// <summary>
/// Lists every file and subdirectory below <paramref name="path"/>. The top level
/// is enumerated on the calling thread with FindFirstFileW / FindNextFile; each
/// top-level subdirectory is then scanned in parallel (PLINQ) by
/// FindNextFilePInvokeRecursive.
/// </summary>
/// <param name="path">Directory to scan, with or without a trailing backslash.</param>
/// <param name="files">All files found at any depth, or null on failure. Order below the top level is not deterministic.</param>
/// <param name="directories">All non-reparse-point subdirectories found at any depth, or null on failure.</param>
/// <returns>False only when an exception was thrown (it is written to the console); true otherwise.</returns>
static bool FindNextFilePInvokeRecursiveParalleled(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
    List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
    // Guards fileList and directoryList while parallel workers merge their results.
    object resultsLock = new object();
    IntPtr findHandle = INVALID_HANDLE_VALUE;

    try
    {
        WIN32_FIND_DATAW findData;
        findHandle = FindFirstFileW(path + @"\*", out findData);
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            do
            {
                // Skip the current-directory and parent-directory pseudo entries.
                if (findData.cFileName == "." || findData.cFileName == "..")
                {
                    continue;
                }

                string fullPath = path + (path.EndsWith("\\") ? String.Empty : "\\") + findData.cFileName;

                bool isDirectory = (findData.dwFileAttributes & FileAttributes.Directory) != 0;
                bool isReparsePoint = (findData.dwFileAttributes & FileAttributes.ReparsePoint) != 0;

                if (!isDirectory)
                {
                    fileList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
                }
                else if (!isReparsePoint)
                {
                    // Symbolic links are skipped: they could lead to repeated files and
                    // folders as well as infinite loops.
                    directoryList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
                }
            }
            while (FindNextFile(findHandle, out findData));

            // Iterate over a snapshot: the workers append to directoryList, and a
            // List<T> must not be modified while it is being enumerated.
            Tuple<string, DateTime>[] topLevelDirectories = directoryList.ToArray();
            topLevelDirectories.AsParallel().ForAll(x =>
            {
                List<Tuple<string, DateTime>> subDirectoryFileList;
                List<Tuple<string, DateTime>> subDirectoryDirectoryList;
                if (FindNextFilePInvokeRecursive(x.Item1, out subDirectoryFileList, out subDirectoryDirectoryList))
                {
                    // List<T> is not safe for concurrent writers.
                    lock (resultsLock)
                    {
                        fileList.AddRange(subDirectoryFileList);
                        directoryList.AddRange(subDirectoryDirectoryList);
                    }
                }
            });
        }

        files = fileList;
        directories = directoryList;
        return true;
    }
    catch (Exception exception)
    {
        Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
        files = null;
        directories = null;
        return false;
    }
    finally
    {
        // Single release point for the search handle on every exit path.
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            FindClose(findHandle);
        }
    }
}

Edit: I forgot to add concurrency locks when using the parallel version; without them you might catch an exception. I also removed the tuples and went with FileInformation / DirectoryInformation classes for my purposes. This shaved off 0.5 seconds. It now takes 3.5 seconds to enumerate my C: drive.

 [DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
public static extern IntPtr FindFirstFileW(string lpFileName, out WIN32_FIND_DATAW lpFindFileData);

[DllImport("kernel32.dll", CharSet = CharSet.Unicode)]
public static extern bool FindNextFile(IntPtr hFindFile, out WIN32_FIND_DATAW lpFindFileData);

[DllImport("kernel32.dll")]
public static extern bool FindClose(IntPtr hFindFile);

/// <summary>
/// Managed mirror of the record filled in by FindFirstFileW / FindNextFile for
/// each directory entry. Field order and sizes must not be changed.
/// </summary>
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode)]
public struct WIN32_FIND_DATAW
{
    public FileAttributes dwFileAttributes;
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftCreationTime;
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastAccessTime;
    internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastWriteTime;
    public int nFileSizeHigh;
    public int nFileSizeLow;
    public int dwReserved0;
    public int dwReserved1;
    [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 260)]
    public string cFileName;
    [MarshalAs(UnmanagedType.ByValTStr, SizeConst = 14)]
    public string cAlternateFileName;
}

// Value FindFirstFileW returns when the search could not be started.
static IntPtr INVALID_HANDLE_VALUE = new IntPtr(-1);

/// <summary>
/// Recursively lists every file and subdirectory below <paramref name="path"/>
/// on the calling thread using FindFirstFileW / FindNextFile.
/// </summary>
/// <param name="path">Directory to scan; a backslash is always appended before the entry name.</param>
/// <param name="files">All files found at any depth, or null on failure.</param>
/// <param name="directories">All non-reparse-point subdirectories found at any depth, or null on failure.</param>
/// <returns>
/// False only when an exception was thrown (it is written to the console). A
/// directory whose search cannot be started yields true with empty lists.
/// </returns>
static bool FindNextFilePInvokeRecursive(string path, out List<FileInformation> files, out List<DirectoryInformation> directories)
{
    List<FileInformation> fileList = new List<FileInformation>();
    List<DirectoryInformation> directoryList = new List<DirectoryInformation>();
    IntPtr findHandle = INVALID_HANDLE_VALUE;

    try
    {
        WIN32_FIND_DATAW findData;
        findHandle = FindFirstFileW(path + @"\*", out findData);
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            do
            {
                // Skip current directory and parent directory symbols that are returned.
                if (findData.cFileName == "." || findData.cFileName == "..")
                {
                    continue;
                }

                string fullPath = path + @"\" + findData.cFileName;

                // Plain bit tests instead of Enum.HasFlag: this runs once per directory entry.
                bool isDirectory = (findData.dwFileAttributes & FileAttributes.Directory) != 0;
                bool isReparsePoint = (findData.dwFileAttributes & FileAttributes.ReparsePoint) != 0;

                if (!isDirectory)
                {
                    fileList.Add(new FileInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
                }
                else if (!isReparsePoint)
                {
                    // Only real directories are descended into: symbolic links could lead to
                    // repeated files and folders as well as infinite loops.
                    directoryList.Add(new DirectoryInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });

                    List<FileInformation> subDirectoryFileList;
                    List<DirectoryInformation> subDirectoryDirectoryList;
                    if (FindNextFilePInvokeRecursive(fullPath, out subDirectoryFileList, out subDirectoryDirectoryList))
                    {
                        fileList.AddRange(subDirectoryFileList);
                        directoryList.AddRange(subDirectoryDirectoryList);
                    }
                }
            }
            while (FindNextFile(findHandle, out findData));
        }

        files = fileList;
        directories = directoryList;
        return true;
    }
    catch (Exception exception)
    {
        Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
        files = null;
        directories = null;
        return false;
    }
    finally
    {
        // Single release point for the search handle on every exit path.
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            FindClose(findHandle);
        }
    }
}

/// <summary>
/// Lists every file and subdirectory below <paramref name="path"/>. The top level
/// is enumerated on the calling thread; each top-level subdirectory is then
/// scanned in parallel (PLINQ) by <c>FindNextFilePInvokeRecursive</c>.
/// </summary>
/// <param name="path">Directory to scan, with or without a trailing backslash.</param>
/// <param name="files">All files found at any depth, or null on failure. Order below the top level is not deterministic.</param>
/// <param name="directories">All non-reparse-point subdirectories found at any depth, or null on failure.</param>
/// <returns>False only when an exception was thrown (it is written to the console); true otherwise.</returns>
static bool FindNextFilePInvokeRecursiveParalleled(string path, out List<FileInformation> files, out List<DirectoryInformation> directories)
{
    List<FileInformation> fileList = new List<FileInformation>();
    object fileListLock = new object();
    List<DirectoryInformation> directoryList = new List<DirectoryInformation>();
    object directoryListLock = new object();
    IntPtr findHandle = INVALID_HANDLE_VALUE;

    try
    {
        WIN32_FIND_DATAW findData;
        path = path.EndsWith(@"\") ? path : path + @"\";
        findHandle = FindFirstFileW(path + @"*", out findData);
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            do
            {
                // Skip current directory and parent directory symbols that are returned.
                if (findData.cFileName == "." || findData.cFileName == "..")
                {
                    continue;
                }

                string fullPath = path + findData.cFileName;

                bool isDirectory = (findData.dwFileAttributes & FileAttributes.Directory) != 0;
                bool isReparsePoint = (findData.dwFileAttributes & FileAttributes.ReparsePoint) != 0;

                if (!isDirectory)
                {
                    fileList.Add(new FileInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
                }
                else if (!isReparsePoint)
                {
                    // Symbolic links are skipped: they could lead to repeated files and
                    // folders as well as infinite loops.
                    directoryList.Add(new DirectoryInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
                }
            }
            while (FindNextFile(findHandle, out findData));

            // Iterate over a snapshot: the workers append to directoryList, and the
            // locks below only serialize writers -- they do not protect an enumeration
            // of the very list that is being appended to.
            DirectoryInformation[] topLevelDirectories = directoryList.ToArray();
            topLevelDirectories.AsParallel().ForAll(x =>
            {
                List<FileInformation> subDirectoryFileList;
                List<DirectoryInformation> subDirectoryDirectoryList;
                if (FindNextFilePInvokeRecursive(x.FullPath, out subDirectoryFileList, out subDirectoryDirectoryList))
                {
                    lock (fileListLock)
                    {
                        fileList.AddRange(subDirectoryFileList);
                    }
                    lock (directoryListLock)
                    {
                        directoryList.AddRange(subDirectoryDirectoryList);
                    }
                }
            });
        }

        files = fileList;
        directories = directoryList;
        return true;
    }
    catch (Exception exception)
    {
        Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
        files = null;
        directories = null;
        return false;
    }
    finally
    {
        // Single release point for the search handle on every exit path.
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            FindClose(findHandle);
        }
    }
}

/// <summary>Full path and last-write time of one file found by a scan.</summary>
public class FileInformation
{
    public string FullPath;
    public DateTime LastWriteTime;
}

/// <summary>Full path and last-write time of one directory found by a scan.</summary>
public class DirectoryInformation
{
    public string FullPath;
    public DateTime LastWriteTime;
}

Edit : BK was asking about converting to DateTime from FILETIME:

 public static class FILETIMEExtensions
{
    /// <summary>
    /// Converts a FILETIME (two 32-bit halves) to a UTC <see cref="DateTime"/>.
    /// </summary>
    /// <param name="time">The value to convert.</param>
    /// <returns>The equivalent UTC date and time.</returns>
    public static DateTime ToDateTime(this System.Runtime.InteropServices.ComTypes.FILETIME time)
    {
        // Both halves are declared as signed int but carry unsigned 32-bit data.
        // The low half must be reinterpreted as uint before widening: casting a
        // negative int straight to ulong sign-extends it, which makes the result
        // 2^32 ticks too small whenever bit 31 of the low half is set.
        long fileTime = unchecked(((long)time.dwHighDateTime << 32) | (uint)time.dwLowDateTime);
        return DateTime.FromFileTimeUtc(fileTime);
    }
}
+6
c # boost windows visual-c ++
Oct 12 '14 at 2:49
source share
1 answer

use LINQ and parallel tasks

 // Sketch only: dir, p, k, f and a are placeholders that this snippet does not define.
// NOTE(review): GetFiles with SearchOption.AllDirectories is believed to abort with
// an exception at the first directory it cannot read -- confirm before using it on
// a whole drive.
var stuff = dir.GetFiles("*.*", System.IO.SearchOption.AllDirectories);
Parallel.ForEach(stuff, p =>
{
    // do things in parallel..
});

// or this
var q = stuff.AsParallel().Where(x => p(x)).OrderBy(x => k(x)).Select(x => f(x));
foreach (var e in q) a(e);
+1
Oct 12 '14 at 2:59
source share



All Articles