From ae1b6583ca11d8579636128bbbb8cf0a8f22e02b Mon Sep 17 00:00:00 2001 From: Les De Ridder Date: Sat, 23 Feb 2019 21:18:19 +0100 Subject: [PATCH] Partially implement warc parsing --- source/app.d | 3 +- source/cm3d2/arc.d | 274 ++++++++++++++++++++++++++++++++++++++++++-- source/cm3d2/menu.d | 4 +- source/cm3d2/util.d | 26 ++--- 4 files changed, 277 insertions(+), 30 deletions(-) diff --git a/source/app.d b/source/app.d index ad8691c..4a524f2 100644 --- a/source/app.d +++ b/source/app.d @@ -33,7 +33,8 @@ void main(string[] args) string arcPath; string mountpoint; - if (args.length < 4 || !(arcPath = args[2]).isValidPath || !(mountpoint = args[3]).isValidPath) + if (args.length < 4 || !(arcPath = args[2]).isValidPath + || !(mountpoint = args[3]).isValidPath) { goto case "help"; } diff --git a/source/cm3d2/arc.d b/source/cm3d2/arc.d index 7554879..a25d4dd 100644 --- a/source/cm3d2/arc.d +++ b/source/cm3d2/arc.d @@ -2,6 +2,9 @@ module cm3d2.arc; import std.file; import std.mmfile; +import std.conv; +import std.stdio; +import std.zlib; import dfuse.fuse; @@ -9,31 +12,282 @@ import cm3d2; class WarcFile { - private MmFile _file; + class DirectoryHashNode + { + private ulong _id; + public FileHashEntry[] subdirectoryEntries; + public FileHashEntry[] fileEntries; + private uint _depth; + private ulong[] _parents; - this(MmFile file) + public DirectoryHashNode[] subdirectories; + + @property + ulong id() + { + return _id; + } + + this(ubyte[] data) + { + auto unknown1 = data.readInteger!uint(); + auto unknown2 = data.readInteger!uint(); + + assert(unknown1 == 0x20 && unknown2 == 0x10, "Invalid directory descriptor"); + + _id = data.readInteger!ulong(); + + auto subdirectoryCount = data.readInteger!uint(); + auto fileCount = data.readInteger!uint(); + + _depth = data.readInteger!uint(); + + auto unknown3 = data.readBytes(4); + + subdirectoryEntries = new FileHashEntry[subdirectoryCount]; + for (auto i = 0; i < subdirectoryCount; i++) + { + subdirectoryEntries[i] = FileHashEntry(data); + } + + fileEntries = new FileHashEntry[fileCount]; + for (auto i = 0; i < fileCount; i++) + { + fileEntries[i] = FileHashEntry(data); + } + + _parents = new ulong[_depth]; + for (auto i = 0; i < _depth; i++) + { + _parents[i] = data.readInteger!ulong(); + } + + subdirectories = new DirectoryHashNode[subdirectoryCount]; + for (auto i = 0; i < subdirectoryCount; i++) + { + subdirectories[i] = new DirectoryHashNode(data); + } + } + } + + struct FileHashEntry + { + ulong hash; + long offset; + + this(ref ubyte[] data) + { + hash = data.readInteger!ulong(); + offset = data.readInteger!long(); + } + } + + abstract class ArcEntry + { + public wstring name; + public ArcEntry parent; + public ArcEntry[] children; + + this(ArcEntry parent) + { + this.parent = parent; + } + } + + class DirectoryEntry : ArcEntry + { + this(ArcEntry parent = null) + { + super(parent); + } + } + + class FileEntry : ArcEntry + { + private bool _deflated; + private uint _uncompressedSize; + private uint _compressedSize; + + private ubyte[] _fileData; + + this(ref ubyte[] data, ArcEntry parent = null) + { + super(parent); + + _deflated = data.readInteger!uint() == 1; + + auto unknown = data.readBytes(4); + if (unknown != [0, 0, 0, 0]) + { + stderr.writeln("FileEntry.unknown: ", unknown); + } + + _uncompressedSize = data.readInteger!uint(); + _compressedSize = data.readInteger!uint(); + + assert(_deflated || _compressedSize == _uncompressedSize, "Uncompressed file entry with unequal sizes"); + + _fileData = data.readBytes(_compressedSize); + } + + @property + ubyte[] data() + { + if (_deflated) + { + return cast(ubyte[]) uncompress(_fileData[], _uncompressedSize); + } + else + { + return _fileData[]; + } + } + } + + private MmFile _file; + private string _path; + + private ubyte[] _header; + private ubyte[] _fileData; + private ubyte[] _footer; + + private DirectoryHashNode _utf8HashTree; + private DirectoryHashNode _utf16HashTree; + private wstring[ulong] _nameLookupTable; + + public ArcEntry[] entries; + + this(MmFile file, string path) { _file = file; + _path = path; + + _header = (cast(ubyte[]) _file[])[0 .. 28]; readHeader(); + readFooter(); + + auto rootName = _nameLookupTable[_utf16HashTree.id]; + stderr.writeln("root name: " ~ rootName); + + ArcEntry[] getArcEntries(ArcEntry parent, DirectoryHashNode directoryHashNode) + { + ArcEntry[] entries; + + foreach (fileHashEntry; directoryHashNode.fileEntries) + { + auto file = _fileData[fileHashEntry.offset .. $]; + + auto fileEntry = new FileEntry(file, parent); + fileEntry.name = _nameLookupTable[fileHashEntry.hash]; + entries ~= fileEntry; + } + + foreach (i, directoryHashEntry; directoryHashNode.subdirectoryEntries) + { + auto directoryEntry = new DirectoryEntry(parent); + directoryEntry.name = _nameLookupTable[directoryHashEntry.hash]; + directoryEntry.children = getArcEntries(directoryEntry, directoryHashNode.subdirectories[i]); + entries ~= directoryEntry; + } + + return entries; + } + + entries = getArcEntries(null, _utf16HashTree); } private void readHeader() { - auto data = cast(ubyte[]) _file[]; + auto data = _header[]; auto type = cast(string) data.readBytes(4); - assert(type == "warc", "Invalid warc file: " ~ type); + assert(type == "warc", _path ~ ": invalid warc file"); + + auto unknown1 = data.readBytes(4); + assert(unknown1 == [0xFF, 0xAA, 0x45, 0xF1], _path ~ ": invalid warc file"); + + auto fileVersion = data.readInteger!uint(); + assert(fileVersion == 1000, _path ~ ": unrecognised version (" ~ fileVersion.to!string + ~ ")"); + + auto unknown2 = data.readInteger!uint(); + assert(unknown2 == 4, _path ~ ": invalid warc file"); + + auto unknown3 = data.readInteger!uint(); + assert(unknown3 == 2, _path ~ ": invalid warc file"); + + auto fileDataLength = data.readInteger!ulong(); + _footer = (cast(ubyte[]) _file[])[_header.length + fileDataLength .. $]; + _fileData = (cast(ubyte[]) _file[])[_header.length .. _header.length + fileDataLength]; + + assert(data.length == 0, _path ~ ": unexpected data at end of .arc header"); + } + + private void readFooter() + { + auto data = _footer[]; + + enum BlockType : uint + { + UTF16HashData = 0, + UTF8HashData = 1, + UTF16NameData = 3 + } + + while (_utf8HashTree is null || _utf16HashTree is null || _nameLookupTable == null) + { + auto blockType = data.readInteger!BlockType(); + auto blockSize = data.readInteger!ulong(); + auto blockData = data.readBytes(blockSize); + + if (blockType == BlockType.UTF8HashData) + { + _utf8HashTree = new DirectoryHashNode(blockData); + } + else if (blockType == BlockType.UTF16HashData) + { + _utf16HashTree = new DirectoryHashNode(blockData); + } + else if (blockType == BlockType.UTF16NameData) + { + readNameLookupTable(new FileEntry(blockData).data); + } + else + { + assert(false, _path ~ ": unknown footer block type (" ~ blockType.to!string ~ ")"); + } + } + + assert(data.length == 0, _path ~ ": unexpected data at end of .arc footer"); + } + + private void readNameLookupTable(ubyte[] data) + { + _nameLookupTable = null; + + while (data.length > 0) + { + auto hash = data.readInteger!ulong(); + auto nameLength = data.readInteger!uint(); + auto name = cast(wstring) data.readBytes(nameLength * 2); + + stderr.writeln(hash, " => \"", name, "\""); + + _nameLookupTable[hash] = name; + } } } class WarpFile { private MmFile _file; + private string _path; - this(MmFile file) + this(MmFile file, string path) { _file = file; + _path = path; readHeader(); } @@ -55,8 +309,6 @@ class ArcFileSystem : Operations this(string[] arcPaths) { - import std.stdio; - foreach (path; arcPaths) { assert(path.exists); @@ -67,15 +319,15 @@ class ArcFileSystem : Operations auto type = cast(string) mmfile[0 .. 4]; if (type == "warc") { - _warcFiles ~= new WarcFile(mmfile); - stderr.writeln("warc"); + + _warcFiles ~= new WarcFile(mmfile, path); } else if (type == "warp") { - _warpFiles ~= new WarpFile(mmfile); - stderr.writeln("warp"); + + _warpFiles ~= new WarpFile(mmfile, path); } else { diff --git a/source/cm3d2/menu.d b/source/cm3d2/menu.d index 30669e8..bdfedb8 100644 --- a/source/cm3d2/menu.d +++ b/source/cm3d2/menu.d @@ -41,13 +41,13 @@ class Menu assert(data.readString() == "CM3D2_MENU", "Not a valid .menu file"); - menu.fileVersion = data.readInt(); + menu.fileVersion = data.readInteger!uint(); menu.path = data.readString(); menu.name = data.readString(); menu.category = data.readString(); menu.description = data.readString(); - assert(data.readInt() == data.length, "Unexpected data at end of file"); + assert(data.readInteger!uint() == data.length, "Unexpected data at end of file"); while (data.length > 0) { diff --git a/source/cm3d2/util.d b/source/cm3d2/util.d index 653c9f0..a98a166 100644 --- a/source/cm3d2/util.d +++ b/source/cm3d2/util.d @@ -5,7 +5,7 @@ import std.range; import std.traits; ubyte readByte(Range)(ref Range data) - if (isRandomAccessRange!(Unqual!(Range)) + if (isRandomAccessRange!(Unqual!Range) && is(ElementType!Range == ubyte) && hasSlicing!Range) { auto value = data[0]; @@ -13,22 +13,17 @@ ubyte readByte(Range)(ref Range data) return value; } -ubyte[] readBytes(Range)(ref Range data, uint count) - if (isRandomAccessRange!(Unqual!(Range)) +Range readBytes(Range)(ref Range data, size_t count) + if (isRandomAccessRange!(Unqual!Range) && is(ElementType!Range == ubyte) && hasSlicing!Range) { - ubyte[] bytes = new ubyte[count]; - - for (auto i = 0; i < count; i++) - { - bytes[i] = data.readByte(); - } - + auto bytes = data[0 .. count]; + data = data[count .. $]; return bytes; } string readString(Range)(ref Range data) - if (isRandomAccessRange!(Unqual!(Range)) + if (isRandomAccessRange!(Unqual!Range) && is(ElementType!Range == ubyte) && hasSlicing!Range) { auto length = 0; @@ -57,11 +52,10 @@ string readString(Range)(ref Range data) return value; } -uint readInt(Range)(ref Range data) - if (isRandomAccessRange!(Unqual!(Range)) - && is(ElementType!Range == ubyte) && hasSlicing!Range) +Integer readInteger(Integer, Range)(ref Range data) + if (isRandomAccessRange!(Unqual!Range) && is(ElementType!Range == ubyte) && hasSlicing!Range) { - auto value = littleEndianToNative!uint(data[0 .. 4]); - data = data[4 .. $]; + auto value = littleEndianToNative!Integer(data[0 .. Integer.sizeof]); + data = data[Integer.sizeof .. $]; return value; }