/// Parsing of CM3D2 .arc archives ("warc" and "warp" containers) and the
/// beginnings of a dfuse filesystem exposing their contents.
module cm3d2.arc;

import std.file;
import std.mmfile;
import std.conv;
import std.stdio;
import std.zlib;

import dfuse.fuse;

import cm3d2;

/// A memory-mapped "warc" archive: a 28-byte header, a file data block, and a
/// footer holding the directory hash trees and the hash-to-name lookup table.
class WarcFile
{
    /// A node of the directory hash tree stored in the archive footer. Each
    /// node records its own hash id, hash/offset entries for its files and
    /// subdirectories, and the ids of its ancestors, followed by the child
    /// nodes themselves.
    class DirectoryHashNode
    {
        private ulong _id;
        public FileHashEntry[] subdirectoryEntries;
        public FileHashEntry[] fileEntries;
        private uint _depth;
        private ulong[] _parents;
        public DirectoryHashNode[] subdirectories;

        @property ulong id() { return _id; }

        // `data` is taken by reference so that parsing the nested child nodes
        // advances the shared cursor past their bytes.
        this(ref ubyte[] data)
        {
            auto unknown1 = data.readInteger!uint();
            auto unknown2 = data.readInteger!uint();
            assert(unknown1 == 0x20 && unknown2 == 0x10, "Invalid directory descriptor");

            _id = data.readInteger!ulong();
            auto subdirectoryCount = data.readInteger!uint();
            auto fileCount = data.readInteger!uint();
            _depth = data.readInteger!uint();
            auto unknown3 = data.readBytes(4);

            subdirectoryEntries = new FileHashEntry[subdirectoryCount];
            foreach (i; 0 .. subdirectoryCount)
            {
                subdirectoryEntries[i] = FileHashEntry(data);
            }

            fileEntries = new FileHashEntry[fileCount];
            foreach (i; 0 .. fileCount)
            {
                fileEntries[i] = FileHashEntry(data);
            }

            // One ancestor id per level of depth.
            _parents = new ulong[_depth];
            foreach (i; 0 .. _depth)
            {
                _parents[i] = data.readInteger!ulong();
            }

            // The child nodes follow immediately, serialised depth-first.
            subdirectories = new DirectoryHashNode[subdirectoryCount];
            foreach (i; 0 .. subdirectoryCount)
            {
                subdirectories[i] = new DirectoryHashNode(data);
            }
        }
    }

    /// A (name hash, data offset) pair identifying a file or subdirectory.
    struct FileHashEntry
    {
        ulong hash;
        long offset;

        this(ref ubyte[] data)
        {
            hash = data.readInteger!ulong();
            offset = data.readInteger!long();
        }
    }

    /// Base type for entries of the in-memory directory tree.
    abstract class ArcEntry
    {
        public wstring name;
        public ArcEntry parent;
        public ArcEntry[] children;

        this(ArcEntry parent)
        {
            this.parent = parent;
        }
    }

    class DirectoryEntry : ArcEntry
    {
        this(ArcEntry parent = null)
        {
            super(parent);
        }
    }

    /// A file stored in the archive, optionally deflate-compressed.
    class FileEntry : ArcEntry
    {
        private bool _deflated;
        private uint _uncompressedSize;
        private uint _compressedSize;
        private ubyte[] _fileData;

        this(ref ubyte[] data, ArcEntry parent = null)
        {
            super(parent);
            _deflated = data.readInteger!uint() == 1;
            auto unknown = data.readBytes(4);
            if (unknown != [0, 0, 0, 0])
            {
                stderr.writeln("FileEntry.unknown: ", unknown);
            }
            _uncompressedSize = data.readInteger!uint();
            _compressedSize = data.readInteger!uint();
            assert(_deflated || _compressedSize == _uncompressedSize,
                "Uncompressed file entry with unequal sizes");
            _fileData = data.readBytes(_compressedSize);
        }

        /// The file contents, inflated on demand if the entry is compressed.
        @property ubyte[] data()
        {
            if (_deflated)
            {
                return cast(ubyte[]) uncompress(_fileData[], _uncompressedSize);
            }
            else
            {
                return _fileData[];
            }
        }
    }

    private MmFile _file;
    private string _path;
    private ubyte[] _header;
    private ubyte[] _fileData;
    private ubyte[] _footer;
    private DirectoryHashNode _utf8HashTree;
    private DirectoryHashNode _utf16HashTree;
    private wstring[ulong] _nameLookupTable;

    public ArcEntry[] entries;

    this(MmFile file, string path)
    {
        _file = file;
        _path = path;
        // The fixed-size header is followed by the file data block and the footer.
        _header = (cast(ubyte[]) _file[])[0 .. 28];
        readHeader();
        readFooter();

        auto rootName = _nameLookupTable[_utf16HashTree.id];
        stderr.writeln("root name: " ~ rootName);

        // Walk the UTF-16 hash tree and build the corresponding entry tree,
        // resolving hashes to names via the lookup table from the footer.
        ArcEntry[] getArcEntries(ArcEntry parent, DirectoryHashNode directoryHashNode)
        {
            ArcEntry[] entries;
            foreach (fileHashEntry; directoryHashNode.fileEntries)
            {
                auto file = _fileData[fileHashEntry.offset .. $];
                auto fileEntry = new FileEntry(file, parent);
                fileEntry.name = _nameLookupTable[fileHashEntry.hash];
                entries ~= fileEntry;
            }
            foreach (i, directoryHashEntry; directoryHashNode.subdirectoryEntries)
            {
                auto directoryEntry = new DirectoryEntry(parent);
                directoryEntry.name = _nameLookupTable[directoryHashEntry.hash];
                directoryEntry.children = getArcEntries(directoryEntry, directoryHashNode.subdirectories[i]);
                entries ~= directoryEntry;
            }
            return entries;
        }

        entries = getArcEntries(null, _utf16HashTree);
    }

    // Header layout: "warc" magic, fixed marker bytes, version, two constant
    // fields, then the length of the file data block as a ulong.
    private void readHeader()
    {
        auto data = _header[];
        auto type = cast(string) data.readBytes(4);
        assert(type == "warc", _path ~ ": invalid warc file");
        auto unknown1 = data.readBytes(4);
        assert(unknown1 == [0xFF, 0xAA, 0x45, 0xF1], _path ~ ": invalid warc file");
        auto fileVersion = data.readInteger!uint();
        assert(fileVersion == 1000, _path ~ ": unrecognised version (" ~ fileVersion.to!string ~ ")");
        auto unknown2 = data.readInteger!uint();
        assert(unknown2 == 4, _path ~ ": invalid warc file");
        auto unknown3 = data.readInteger!uint();
        assert(unknown3 == 2, _path ~ ": invalid warc file");
        auto fileDataLength = data.readInteger!ulong();
        _footer = (cast(ubyte[]) _file[])[_header.length + fileDataLength .. $];
        _fileData = (cast(ubyte[]) _file[])[_header.length .. _header.length + fileDataLength];
        assert(data.length == 0, _path ~ ": unexpected data at end of .arc header");
    }

    // The footer is a sequence of typed blocks (uint type tag, ulong size,
    // payload): two directory hash trees and a name lookup table.
    private void readFooter()
    {
        auto data = _footer[];

        enum BlockType : uint
        {
            UTF16HashData = 0,
            UTF8HashData = 1,
            UTF16NameData = 3
        }

        while (_utf8HashTree is null || _utf16HashTree is null || _nameLookupTable is null)
        {
            auto blockType = data.readInteger!BlockType();
            auto blockSize = data.readInteger!ulong();
            auto blockData = data.readBytes(blockSize);
            if (blockType == BlockType.UTF8HashData)
            {
                _utf8HashTree = new DirectoryHashNode(blockData);
            }
            else if (blockType == BlockType.UTF16HashData)
            {
                _utf16HashTree = new DirectoryHashNode(blockData);
            }
            else if (blockType == BlockType.UTF16NameData)
            {
                // The name table is itself stored as a (possibly deflated) file entry.
                readNameLookupTable(new FileEntry(blockData).data);
            }
            else
            {
                assert(false, _path ~ ": unknown footer block type (" ~ blockType.to!string ~ ")");
            }
        }
        assert(data.length == 0, _path ~ ": unexpected data at end of .arc footer");
    }

    // The name table maps 64-bit hashes to UTF-16 names: each record is a
    // ulong hash, a uint character count, and the name itself.
    private void readNameLookupTable(ubyte[] data)
    {
        _nameLookupTable = null;
        while (data.length > 0)
        {
            auto hash = data.readInteger!ulong();
            auto nameLength = data.readInteger!uint();
            auto name = cast(wstring) data.readBytes(nameLength * 2);
            stderr.writeln(hash, " => \"", name, "\"");
            _nameLookupTable[hash] = name;
        }
    }
}

/// A "warp" archive; only the magic check is implemented so far.
class WarpFile
{
    private MmFile _file;
    private string _path;

    this(MmFile file, string path)
    {
        _file = file;
        _path = path;
        readHeader();
    }

    private void readHeader()
    {
        auto data = cast(ubyte[]) _file[];
        auto type = cast(string) data.readBytes(4);
        assert(type == "warp", "Invalid warp file: " ~ type);
    }
}

/// dfuse filesystem over a set of .arc files. Currently this only opens and
/// classifies the archives; the filesystem operations are not implemented yet.
class ArcFileSystem : Operations
{
    private string[] _arcPaths;
    private WarcFile[] _warcFiles;
    private WarpFile[] _warpFiles;

    this(string[] arcPaths)
    {
        foreach (path; arcPaths)
        {
            assert(path.exists);
            stderr.write(path ~ ": ");
            auto mmfile = new MmFile(path);
            // Dispatch on the four-byte magic at the start of the file.
            auto type = cast(string) mmfile[0 .. 4];
            if (type == "warc")
            {
                stderr.writeln("warc");
                _warcFiles ~= new WarcFile(mmfile, path);
            }
            else if (type == "warp")
            {
                stderr.writeln("warp");
                _warpFiles ~= new WarpFile(mmfile, path);
            }
            else
            {
                assert(false, path ~ ": not a valid .arc file");
            }
        }
        stderr.writeln(".arc files checked");
        throw new Exception("Not implemented yet");
    }
}
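
// Hypothetical usage sketch, not part of the original module: how an
// ArcFileSystem might be mounted through dfuse. The Fuse constructor and
// mount() call follow the dfuse examples (new Fuse(name, foreground, threaded);
// fuse.mount(operations, mountpoint, options)); verify them against the dfuse
// version in use. ArcFileSystem's constructor currently throws
// "Not implemented yet", so this is illustrative only.
version (none)
{
    int main(string[] args)
    {
        if (args.length < 3)
        {
            stderr.writeln("usage: arcfs <mountpoint> <file.arc> [more.arc ...]");
            return 1;
        }
        auto fuse = new Fuse("ArcFileSystem", true, false);
        fuse.mount(new ArcFileSystem(args[2 .. $]), args[1], []);
        return 0;
    }
}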