diff --git a/proj/SkyXEngine/vs2013/SkyXEngine.sln b/proj/SkyXEngine/vs2013/SkyXEngine.sln index 4cf1c805fac086c50d43701e9f488031d4b3b861..7bf5b3d524ba84aadc75a4a31ec74f515e256973 100644 --- a/proj/SkyXEngine/vs2013/SkyXEngine.sln +++ b/proj/SkyXEngine/vs2013/SkyXEngine.sln @@ -8,6 +8,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SkyXEngine", "SkyXEngine.vc {7C0C8205-BDD3-44A3-AA3A-7855C7EFC88E} = {7C0C8205-BDD3-44A3-AA3A-7855C7EFC88E} {349DE810-592A-42AF-9440-A0B3F46A9229} = {349DE810-592A-42AF-9440-A0B3F46A9229} {236F4A16-78D8-42E4-86C0-30265CA2D84D} = {236F4A16-78D8-42E4-86C0-30265CA2D84D} + {8B15BF1C-323B-4F4E-86D4-C65986FEDD3E} = {8B15BF1C-323B-4F4E-86D4-C65986FEDD3E} {99BF8838-DB9E-4CFE-83F7-ACAA8A97304B} = {99BF8838-DB9E-4CFE-83F7-ACAA8A97304B} {CD470440-2CC2-42BA-8B94-2B1C72125FE3} = {CD470440-2CC2-42BA-8B94-2B1C72125FE3} {B9656841-7734-4D0B-8619-1BED5E2ED7AE} = {B9656841-7734-4D0B-8619-1BED5E2ED7AE} @@ -370,6 +371,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "pngplugin", "..\..\pngplugi EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xParticles", "..\..\xParticles\vs2013\xParticles.vcxproj", "{6CB8C7D4-F3F9-418D-8939-BB07C488FF01}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "fbxplugin", "..\..\fbxplugin\vs2013\fbxplugin.vcxproj", "{8B15BF1C-323B-4F4E-86D4-C65986FEDD3E}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Win32 = Debug|Win32 @@ -780,6 +783,14 @@ Global {6CB8C7D4-F3F9-418D-8939-BB07C488FF01}.Release|Win32.Build.0 = Release|Win32 {6CB8C7D4-F3F9-418D-8939-BB07C488FF01}.Release|x64.ActiveCfg = Release|x64 {6CB8C7D4-F3F9-418D-8939-BB07C488FF01}.Release|x64.Build.0 = Release|x64 + {8B15BF1C-323B-4F4E-86D4-C65986FEDD3E}.Debug|Win32.ActiveCfg = Debug|Win32 + {8B15BF1C-323B-4F4E-86D4-C65986FEDD3E}.Debug|Win32.Build.0 = Debug|Win32 + {8B15BF1C-323B-4F4E-86D4-C65986FEDD3E}.Debug|x64.ActiveCfg = Debug|x64 + {8B15BF1C-323B-4F4E-86D4-C65986FEDD3E}.Debug|x64.Build.0 = Debug|x64 + {8B15BF1C-323B-4F4E-86D4-C65986FEDD3E}.Release|Win32.ActiveCfg = Release|Win32 + {8B15BF1C-323B-4F4E-86D4-C65986FEDD3E}.Release|Win32.Build.0 = Release|Win32 + {8B15BF1C-323B-4F4E-86D4-C65986FEDD3E}.Release|x64.ActiveCfg = Release|x64 + {8B15BF1C-323B-4F4E-86D4-C65986FEDD3E}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -855,6 +866,7 @@ Global {D61CDBAB-DD14-4380-B400-12268DB975E0} = {7C1F0E50-7A19-4AB4-B559-11EF078F4787} {99BF8838-DB9E-4CFE-83F7-ACAA8A97304B} = {D61CDBAB-DD14-4380-B400-12268DB975E0} {6CB8C7D4-F3F9-418D-8939-BB07C488FF01} = {E6B16854-D4A4-4B56-8E1C-482DD523F205} + {8B15BF1C-323B-4F4E-86D4-C65986FEDD3E} = {7C1F0E50-7A19-4AB4-B559-11EF078F4787} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {B3433050-1AA4-4375-8CC4-42BB0954D80A} diff --git a/proj/fbxplugin/vs2013/fbxplugin.vcxproj b/proj/fbxplugin/vs2013/fbxplugin.vcxproj new file mode 100644 index 0000000000000000000000000000000000000000..7c6ccb034f19e316a93ab2e63598b13ab00cd26a --- /dev/null +++ b/proj/fbxplugin/vs2013/fbxplugin.vcxproj @@ -0,0 +1,192 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup Label="ProjectConfigurations"> + <ProjectConfiguration Include="Debug|Win32"> + <Configuration>Debug</Configuration> + <Platform>Win32</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Debug|x64"> 
+ <Configuration>Debug</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|Win32"> + <Configuration>Release</Configuration> + <Platform>Win32</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|x64"> + <Configuration>Release</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + </ItemGroup> + <ItemGroup> + <ClCompile Include="..\..\..\source\fbxplugin\dllmain.cpp" /> + <ClCompile Include="..\..\..\source\fbxplugin\libdeflate.c" /> + <ClCompile Include="..\..\..\source\fbxplugin\ModelLoader.cpp" /> + <ClCompile Include="..\..\..\source\fbxplugin\plugin_main.cpp" /> + </ItemGroup> + <ItemGroup> + <ClInclude Include="..\..\..\source\fbxplugin\fbx.h" /> + <ClInclude Include="..\..\..\source\fbxplugin\libdeflate.h" /> + <ClInclude Include="..\..\..\source\fbxplugin\ModelLoader.h" /> + </ItemGroup> + <PropertyGroup Label="Globals"> + <ProjectGuid>{8B15BF1C-323B-4F4E-86D4-C65986FEDD3E}</ProjectGuid> + <Keyword>Win32Proj</Keyword> + <RootNamespace>fbxplugin</RootNamespace> + <ProjectName>fbxplugin</ProjectName> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> + <ConfigurationType>DynamicLibrary</ConfigurationType> + <UseDebugLibraries>true</UseDebugLibraries> + <PlatformToolset>v120_xp</PlatformToolset> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> + <ConfigurationType>DynamicLibrary</ConfigurationType> + <UseDebugLibraries>true</UseDebugLibraries> + <PlatformToolset>v120_xp</PlatformToolset> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> + <ConfigurationType>DynamicLibrary</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <PlatformToolset>v120_xp</PlatformToolset> + <WholeProgramOptimization>true</WholeProgramOptimization> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> + <ConfigurationType>DynamicLibrary</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <PlatformToolset>v120_xp</PlatformToolset> + <WholeProgramOptimization>true</WholeProgramOptimization> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + 
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> + <LinkIncremental>false</LinkIncremental> + <OutDir>../../../build/bin/plugins/</OutDir> + <IncludePath>$(IncludePath);../../../sdks/;$(WindowsSdk_IncludePath);../../../source;</IncludePath> + <LibraryPath>;../../../libs;$(LibraryPath)</LibraryPath> + <SourcePath>../../../source;$(SourcePath)</SourcePath> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <IncludePath>$(IncludePath);../../../sdks/;$(WindowsSdk_IncludePath);../../../source;</IncludePath> + <LibraryPath>;../../../libs64;$(LibraryPath)</LibraryPath> + <SourcePath>../../../source;$(SourcePath)</SourcePath> + <LinkIncremental>false</LinkIncremental> + <OutDir>../../../build/bin64/plugins/</OutDir> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> + <LinkIncremental>false</LinkIncremental> + <OutDir>../../../build/bin/plugins/</OutDir> + <IncludePath>$(IncludePath);../../../sdks/;$(WindowsSdk_IncludePath);../../../source;</IncludePath> + <LibraryPath>;../../../libs;$(LibraryPath)</LibraryPath> + <SourcePath>../../../source;$(SourcePath)</SourcePath> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <IncludePath>$(IncludePath);../../../sdks/;$(WindowsSdk_IncludePath);../../../source;</IncludePath> + <LibraryPath>;../../../libs64;$(LibraryPath)</LibraryPath> + <SourcePath>../../../source;$(SourcePath)</SourcePath> + <LinkIncremental>false</LinkIncremental> + <OutDir>../../../build/bin64/plugins/</OutDir> + </PropertyGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> + <ClCompile> + <PrecompiledHeader> + </PrecompiledHeader> + <WarningLevel>Level3</WarningLevel> + <Optimization>Disabled</Optimization> + <PreprocessorDefinitions>WIN32;_CRT_SECURE_NO_WARNINGS;_DEBUG;_WINDOWS;SX_LIB_NAME="FBX";%(PreprocessorDefinitions)</PreprocessorDefinitions> + <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> + <DebugInformationFormat>ProgramDatabase</DebugInformationFormat> + <TreatSpecificWarningsAsErrors>4316</TreatSpecificWarningsAsErrors> + </ClCompile> + <Link> + <SubSystem>Windows</SubSystem> + <GenerateDebugInformation>true</GenerateDebugInformation> + <ImportLibrary>../../../libs/$(TargetName).lib</ImportLibrary> + <ProgramDatabaseFile>$(ProjectDir)../../../pdb/$(TargetName).pdb</ProgramDatabaseFile> + <Profile>true</Profile> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <ClCompile> + <PrecompiledHeader> + </PrecompiledHeader> + <WarningLevel>Level3</WarningLevel> + <Optimization>Disabled</Optimization> + <PreprocessorDefinitions>WIN64;_CRT_SECURE_NO_WARNINGS;_DEBUG;_WINDOWS;SX_LIB_NAME="FBX";%(PreprocessorDefinitions)</PreprocessorDefinitions> + <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary> + <DebugInformationFormat>ProgramDatabase</DebugInformationFormat> + <TreatSpecificWarningsAsErrors>4316</TreatSpecificWarningsAsErrors> + </ClCompile> + <Link> + <SubSystem>Windows</SubSystem> + <GenerateDebugInformation>true</GenerateDebugInformation> + 
<ImportLibrary>../../../libs/$(TargetName).lib</ImportLibrary> + <ProgramDatabaseFile>$(ProjectDir)../../../pdb64/$(TargetName).pdb</ProgramDatabaseFile> + <Profile>true</Profile> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <PrecompiledHeader> + </PrecompiledHeader> + <Optimization>MaxSpeed</Optimization> + <FunctionLevelLinking>true</FunctionLevelLinking> + <IntrinsicFunctions>true</IntrinsicFunctions> + <PreprocessorDefinitions>WIN32;_CRT_SECURE_NO_WARNINGS;NDEBUG;_WINDOWS;SX_LIB_NAME="FBX";%(PreprocessorDefinitions)</PreprocessorDefinitions> + <RuntimeLibrary>MultiThreaded</RuntimeLibrary> + <TreatSpecificWarningsAsErrors>4316</TreatSpecificWarningsAsErrors> + </ClCompile> + <Link> + <SubSystem>Windows</SubSystem> + <GenerateDebugInformation>true</GenerateDebugInformation> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <OptimizeReferences>true</OptimizeReferences> + <ProgramDatabaseFile>$(ProjectDir)../../../pdb/$(TargetName).pdb</ProgramDatabaseFile> + <ImportLibrary>../../../libs/$(TargetName).lib</ImportLibrary> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <PrecompiledHeader> + </PrecompiledHeader> + <Optimization>MaxSpeed</Optimization> + <FunctionLevelLinking>true</FunctionLevelLinking> + <IntrinsicFunctions>true</IntrinsicFunctions> + <PreprocessorDefinitions>WIN64;_CRT_SECURE_NO_WARNINGS;NDEBUG;_WINDOWS;SX_LIB_NAME="FBX";%(PreprocessorDefinitions)</PreprocessorDefinitions> + <RuntimeLibrary>MultiThreaded</RuntimeLibrary> + <TreatSpecificWarningsAsErrors>4316</TreatSpecificWarningsAsErrors> + </ClCompile> + <Link> + <SubSystem>Windows</SubSystem> + <GenerateDebugInformation>true</GenerateDebugInformation> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <OptimizeReferences>true</OptimizeReferences> + <ProgramDatabaseFile>$(ProjectDir)../../../pdb64/$(TargetName).pdb</ProgramDatabaseFile> + <ImportLibrary>../../../libs/$(TargetName).lib</ImportLibrary> + </Link> + </ItemDefinitionGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project> \ No newline at end of file diff --git a/proj/fbxplugin/vs2013/fbxplugin.vcxproj.filters b/proj/fbxplugin/vs2013/fbxplugin.vcxproj.filters new file mode 100644 index 0000000000000000000000000000000000000000..598161164eb6a2b11261644a4e688e14f0ab9ff9 --- /dev/null +++ b/proj/fbxplugin/vs2013/fbxplugin.vcxproj.filters @@ -0,0 +1,40 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{4F9F3B4E-F189-4263-BB44-FD10D756B9AB}</UniqueIdentifier> + <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{3CE894A3-1008-4F79-80BB-FD47268A2DF7}</UniqueIdentifier> + <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{52E3E84A-363D-49C5-ADE7-E24E930CEB53}</UniqueIdentifier> + <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="..\..\..\source\fbxplugin\dllmain.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + 
<ClCompile Include="..\..\..\source\fbxplugin\ModelLoader.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\..\..\source\fbxplugin\plugin_main.cpp" /> + <ClCompile Include="..\..\..\source\fbxplugin\libdeflate.c"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> + <ItemGroup> + <ClInclude Include="..\..\..\source\fbxplugin\ModelLoader.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\..\..\source\fbxplugin\fbx.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\..\..\source\fbxplugin\libdeflate.h"> + <Filter>Header Files</Filter> + </ClInclude> + </ItemGroup> +</Project> \ No newline at end of file diff --git a/source/fbxplugin/ModelLoader.cpp b/source/fbxplugin/ModelLoader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f8870422b1f16fd239e0f4b1a8a545bae3691415 --- /dev/null +++ b/source/fbxplugin/ModelLoader.cpp @@ -0,0 +1,1186 @@ +#include "ModelLoader.h" +#include "libdeflate.h" + +CModelLoader::CModelLoader(IFileSystem *pFileSystem): +m_pFileSystem(pFileSystem) +{ +} + +UINT XMETHODCALLTYPE CModelLoader::getExtCount() const +{ + return(1); +} +const char* XMETHODCALLTYPE CModelLoader::getExt(UINT uIndex) const +{ + assert(uIndex < getExtCount()); + switch(uIndex) + { + case 0: + return("fbx"); + } + return(NULL); +} +const char* XMETHODCALLTYPE CModelLoader::getExtText(UINT uIndex) const +{ + assert(uIndex < getExtCount()); + switch(uIndex) + { + case 0: + return("Kaydara model"); + } + return(NULL); +} +const char* XMETHODCALLTYPE CModelLoader::getAuthor() const +{ + return("EyeGuy @ Thunderfront Studio"); +} +const char* XMETHODCALLTYPE CModelLoader::getCopyright() const +{ + return("Copyright © Ivan Dokunov, Evgeny Danilovich, 2025"); +} +const char* XMETHODCALLTYPE CModelLoader::getDescription() const +{ + return("FBX model loader"); +} + +void XMETHODCALLTYPE CModelLoader::getInfo(XModelInfo *pModelInfo) +{ + *pModelInfo = {}; + + Array<FBXNodeRecord*> aNodes; + Array<FBXNodeRecord*> aBufferNodes; + + getNodeByName("Model", &aNodes); + + const size_t lodeSize = 10; + int iBestIndex = -1; + bool aLodExist[lodeSize] = {}; + int32_t aLodIndex[lodeSize] = {}; + + Array<int32_t> aMaterialsIndexes; + Array<FBXNodeRecord*> aMaterials; + + fora(i, aNodes) + { + int index = 0; + if(aNodes[i]->aProps.size() > 1 && aNodes[i]->aProps[1].cType == 'S') + { + FBXPropertyRecord &props = aNodes[i]->aProps[1]; + size_t len = strlen(props.data.pc); + // len > strlen(LOD) word + // после props.data.pc есть еще одна строка (размер strlen(props.data.pc) и props.uArrSize будет разным) + if(len > 5) + { + if( + tolower(props.data.pc[len - 4]) == 'l' && + tolower(props.data.pc[len - 3]) == 'o' && + tolower(props.data.pc[len - 2]) == 'd' && + isdigit(props.data.pc[len - 1])) + { + index = props.data.pc[len - 1] - '0'; + } + } + } + aLodExist[index] = true; + + if(index < iBestIndex || iBestIndex == -1) + { + iBestIndex = index; + + aMaterials.clearFast(); + + pModelInfo->uVertexCount = 0; + pModelInfo->uIndexCount = 0; + } + + if(index == iBestIndex) + { + aBufferNodes.clearFast(); + + FBXNodeRecord *pGeomNode = getChildNode("Geometry", aNodes[i]); + + getNodeByName("Vertices", &aBufferNodes, pGeomNode); + FBXPropertyRecord *pProp = &aBufferNodes[0]->aProps[0]; + + assert(pProp->cType == 'd'); + + pModelInfo->uVertexCount += pProp->uArrSize / 3; + + aBufferNodes.clearFast(); + getNodeByName("PolygonVertexIndex", &aBufferNodes, pGeomNode); + pProp = &aBufferNodes[0]->aProps[0]; + + 
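+			// PolygonVertexIndex is an int32 array of polygon corners; a negative entry marks
+			// the last corner of a polygon and encodes the real index as -(index + 1)
+			// (see getCountIndexes(), which undoes this with -(*pIterator) - 1).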
assert(pProp->cType == 'i'); + + pModelInfo->uIndexCount += getCountIndexes(pProp->data.pi, pProp->uArrSize); + + aBufferNodes.clearFast(); + getNodeByName("LayerElementMaterial", &aBufferNodes, pGeomNode); + pProp = &aBufferNodes[0]->aProps[0]; + + FBXNodeRecord *pLayerMaterial = aBufferNodes[0]; + + aBufferNodes.clearFast(); + getNodeByName("Materials", &aBufferNodes, pLayerMaterial); + pProp = &aBufferNodes[0]->aProps[0]; + + assert(pProp->cType == 'i'); + + uint32_t uSize = pProp->uArrSize; + + for(int j = 0; j < uSize; ++j) + { + if(aMaterialsIndexes.indexOf(pProp->data.pc[j]) < 0) + { + aMaterialsIndexes.push_back(pProp->data.pc[j]); + } + } + + fora(j, aMaterialsIndexes) + { + FBXNodeRecord *pMaterialNode = getChildNode("Material", aNodes[i], aMaterialsIndexes[j]); + + if(aMaterials.indexOf(pMaterialNode) < 0) + { + aMaterials.push_back(pMaterialNode); + } + } + + aMaterialsIndexes.clearFast(); + } + } + + pModelInfo->uSubsetsCount = aMaterials.size(); + + pModelInfo->uLodCount = 0; + for(int i = 0; i < ARRAYSIZE(aLodExist); ++i) + { + if(aLodExist[i]) + { + ++pModelInfo->uLodCount; + } + } + + Array<FBXNodeRecord*> pGeomArray; + FBXPropertyRecord *pProp = NULL; + + float3 vmin, vmax; + + getNodeByName("Geometry", &pGeomArray, &m_rootNode); + + fora(i, pGeomArray) + { + aBufferNodes.clearFast(); + + getNodeByName("Vertices", &aBufferNodes, pGeomArray[i]); + pProp = &aBufferNodes[0]->aProps[0]; + + assert(pProp->cType == 'd'); + + for(int j = 0; j < pProp->uArrSize; j += 3) + { + float3 vertex((float)pProp->data.pd[j], (float)pProp->data.pd[j + 1], (float)pProp->data.pd[j + 2]); + + if(j == 0 && i == 0) + { + vmin = vmax = vertex; + } + else + { + vmin = SMVectorMin(vmin, vertex); + vmax = SMVectorMax(vmax, vertex); + } + } + } + + pGeomArray.clearFast(); + getNodeByName("GlobalSettings", &pGeomArray, &m_rootNode); + + FBXNodeRecord *pGlobal = pGeomArray[0]; + + pGeomArray.clearFast(); + getNodeByName("P", &pGeomArray, pGlobal); + + fora(i, pGeomArray) + { + assert(pGeomArray[i]->aProps[0].cType == 'S'); + + if(strcmp(pGeomArray[i]->aProps[0].data.pc, "UnitScaleFactor") == 0) + { + assert(pGeomArray[i]->aProps[4].cType == 'D'); + m_fUnitScaleFactor = (float)pGeomArray[i]->aProps[4].data.d; + break; + } + } + + pModelInfo->vDimensions = (vmax - vmin) / 100.0f * m_fUnitScaleFactor; +} + +bool XMETHODCALLTYPE CModelLoader::open(const char *szFileName) +{ + assert(!m_pCurrentFile && "File already opened!"); + if(m_pCurrentFile) + { + return(false); + } + + m_pCurrentFile = m_pFileSystem->openFile(szFileName); + if(!m_pCurrentFile) + { + return(false); + } + + m_sFileName = szFileName; + + FBXHeader hdr = {}; + + m_pCurrentFile->readBin(&hdr, sizeof(hdr)); + + if(strcmp(hdr.szMagic, FBX_MAGIC) != 0) + { + LogError("Invalid FBX magick\n"); + mem_release(m_pCurrentFile); + return(false); + } + + m_isOldVer = false; + + if(hdr.uVersion < FBX_VERSION) + { + //LogError("Unsupported FBX version %u\n", hdr.uVersion); + //mem_release(m_pCurrentFile); + //return(false); + m_isOldVer = true; + } + + FBXNodeRecordHeader recHdr; + + while(!m_pCurrentFile->isEOF()) + { + if(m_isOldVer) + { + FBXNodeRecordHeaderOld recHdrOld; + m_pCurrentFile->readBin(&recHdrOld, sizeof(recHdrOld)); + + recHdr.u8NameLen = recHdrOld.u8NameLen; + recHdr.uEndOffset = recHdrOld.uEndOffset; + recHdr.uNumProperties = recHdrOld.uNumProperties; + recHdr.uPropertyListLen = recHdrOld.uPropertyListLen; + } + else + { + m_pCurrentFile->readBin(&recHdr, sizeof(recHdr)); + } + + if(isNullNode(&recHdr)) + { + break; + } + + 
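+		// NOTE: indexing one slot past the current size appears to rely on the engine's
+		// Array growing on out-of-range access and returning a default-constructed element
+		// (the same idiom is used in readNode()); this is inferred from usage, not a documented guarantee.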
FBXNodeRecord &rec = m_rootNode.aNodes[m_rootNode.aNodes.size()]; + rec.hdr = recHdr; + + if(!readNode(&rec)) + { + close(); + return(false); + } + } + + printNode(&m_rootNode, 0); + + XModelInfo x; + + getInfo(&x); + + return(true); +} + +XMODELTYPE XMETHODCALLTYPE CModelLoader::getType() const +{ + return(XMT_STATIC);//return((m_hdr.iFlags & MODEL_FLAG_STATIC) ? XMT_STATIC : XMT_ANIMATED); +} + +bool XMETHODCALLTYPE CModelLoader::loadAsStatic(IXResourceModelStatic *pResource) +{ + //if(!loadGeneric(pResource)) + //{ + // return(false); + //} + + pResource->setPrimitiveTopology(XPT_TRIANGLELIST); + + Array<FBXNodeRecord*> aNodes; + Array<FBXNodeRecord*> aBufferNodes; + + getNodeByName("Model", &aNodes); + + const size_t lodeSize = 10; + int32_t aLodIndex[lodeSize] = {}; + + Array<Array<FBXNodeRecord*>> aLodModels; + + fora(i, aNodes) + { + int index = 0; + if(aNodes[i]->aProps.size() > 1 && aNodes[i]->aProps[1].cType == 'S') + { + FBXPropertyRecord &props = aNodes[i]->aProps[1]; + size_t len = strlen(props.data.pc); + // len > strlen(LOD) word + // после props.data.pc есть еще одна строка (размер strlen(props.data.pc) и props.uArrSize будет разным) + if(len > 5) + { + if( + tolower(props.data.pc[len - 4]) == 'l' && + tolower(props.data.pc[len - 3]) == 'o' && + tolower(props.data.pc[len - 2]) == 'd' && + isdigit(props.data.pc[len - 1])) + { + index = props.data.pc[len - 1] - '0'; + } + } + } + + aLodModels[index].push_back(aNodes[i]); + } + + aBufferNodes.clearFast(); + getNodeByName("Pose", &aBufferNodes, &m_rootNode); + + FBXNodeRecord *pPose = aBufferNodes.size() ? aBufferNodes[0] : NULL; + + + aBufferNodes.clearFast(); + getNodeByName("GlobalSettings", &aBufferNodes, &m_rootNode); + + FBXNodeRecord *pGlobal = aBufferNodes[0]; + + aBufferNodes.clearFast(); + getNodeByName("P", &aBufferNodes, pGlobal); + + fora(i, aBufferNodes) + { + assert(aBufferNodes[i]->aProps[0].cType == 'S'); + + if(strcmp(aBufferNodes[i]->aProps[0].data.pc, "UnitScaleFactor") == 0) + { + assert(aBufferNodes[i]->aProps[4].cType == 'D'); + m_fUnitScaleFactor = (float)aBufferNodes[i]->aProps[4].data.d; + break; + } + } + + Array<Array<XResourceModelStaticVertex>> aVerteces; + Array<Array<uint32_t>> aIndeces; + + Array<UINT> aMaterialMap; + Array<FBXNodeRecord*> aMaterials; + + getNodeByName("Material", &aMaterials, &m_rootNode); + + pResource->setMaterialCount(aMaterials.size(), 1); + + char szDirName[512]; + char szFileName[256]; + + strcpy(szDirName, m_sFileName.c_str()); + dirname(szDirName); + szDirName[strlen(szDirName) - 1] = 0; + + fora(i, aMaterials) + { + FBXNodeRecord *pTextureNode = getChildNode("Texture", aMaterials[i]); + + if(pTextureNode) + { + aBufferNodes.clearFast(); + getNodeByName("FileName", &aBufferNodes, pTextureNode); + FBXPropertyRecord *pProp = &aBufferNodes[0]->aProps[0]; + + assert(pProp->cType == 'S'); + + sprintf(szFileName, "%s_%s", basename(szDirName), basename(pProp->data.pc)); + + UINT uIndex = 0; + int iLastPos = -1; + + while(szFileName[uIndex++]) + { + if(szFileName[uIndex] == '.') + { + iLastPos = uIndex; + } + } + + if(iLastPos > 0) + { + szFileName[iLastPos] = 0; + } + + pResource->setMaterial(i, 0, szFileName); + } + } + + m_sFileName.release(); + + VerticesData vd = {}; + + FBXNodeRecord *pMaterialNode; + UINT uMaterialCounter = 0; + + fora(i, aLodModels) + { + aVerteces.clearFast(); + aIndeces.clearFast(); + + fora(j, aLodModels[i]) + { + FBXNodeRecord *pModel = aLodModels[i][j]; + + uMaterialCounter = 0; + aMaterialMap.clearFast(); + while((pMaterialNode = 
getChildNode("Material", pModel, uMaterialCounter++))) + { + aMaterialMap.push_back(aMaterials.indexOf(pMaterialNode)); + } + + FBXNodeRecord *pMatrixNode = NULL; + + if(pPose) + { + aBufferNodes.clearFast(); + getNodeByName("PoseNode", &aBufferNodes, pPose); + + fora(k, aBufferNodes) + { + if(pModel->aProps[0].data.i64 == aBufferNodes[k]->aNodes[0].aProps[0].data.i64) + { + pMatrixNode = &(aBufferNodes[k]->aNodes[1]); + } + } + } + + vd.pMatrixNode = pMatrixNode; + vd.qPreRotation = SMQuaternion(); + + aBufferNodes.clearFast(); + getNodeByName("Properties70", &aBufferNodes, pModel); + if(aBufferNodes.size()) + { + FBXNodeRecord *pModelPropsNode = aBufferNodes[0]; + + aBufferNodes.clearFast(); + getNodeByName("P", &aBufferNodes, pModelPropsNode); + + fora(k, aBufferNodes) + { + if(!strcmp(aBufferNodes[k]->aProps[0].data.pc, "PreRotation")) + { + float fX = SMToRadian(aBufferNodes[k]->aProps[4].data.d); + float fY = SMToRadian(aBufferNodes[k]->aProps[5].data.d); + float fZ = SMToRadian(aBufferNodes[k]->aProps[6].data.d); + vd.qPreRotation = SMQuaternion(fX, 'x') * SMQuaternion(fY, 'y') * SMQuaternion(fZ, 'z'); + break; + } + } + } + + FBXNodeRecord *pGeomNode = getChildNode("Geometry", pModel); + + if(!pGeomNode) + { + continue; + } + + aBufferNodes.clearFast(); + getNodeByName("Vertices", &aBufferNodes, pGeomNode); + FBXPropertyRecord *pProp = &aBufferNodes[0]->aProps[0]; + + vd.pVerteces = pProp->data.pd; + + aBufferNodes.clearFast(); + getNodeByName("PolygonVertexIndex", &aBufferNodes, pGeomNode); + pProp = &aBufferNodes[0]->aProps[0]; + + vd.pIndexes = pProp->data.pi; + vd.uSizeIndexes = pProp->uArrSize; + + aBufferNodes.clearFast(); + getNodeByName("LayerElementUV", &aBufferNodes, pGeomNode); + if(!aBufferNodes.size()) + { + LogError("Model " COLOR_LCYAN "%s" COLOR_LRED " has no texture coords\n", pModel->aProps[1].data.pc); + continue; + } + + FBXNodeRecord *pLayerElUV = aBufferNodes[0]; + + { + aBufferNodes.clearFast(); + getNodeByName("MappingInformationType", &aBufferNodes, pLayerElUV); + EnumReflector::Enumerator item = EnumReflector::Get<FBX_MAPPING_INFORMATION_TYPE>().find(aBufferNodes[0]->aProps[0].data.pc); + assert(item.isValid()); + vd.uvMappingInfoType = (FBX_MAPPING_INFORMATION_TYPE)item.getValue(); + } + + { + aBufferNodes.clearFast(); + getNodeByName("ReferenceInformationType", &aBufferNodes, pLayerElUV); + EnumReflector::Enumerator item = EnumReflector::Get<FBX_REFERENCE_INFORMATION_TYPE>().find(aBufferNodes[0]->aProps[0].data.pc); + assert(item.isValid()); + vd.uvRefInfoType = (FBX_REFERENCE_INFORMATION_TYPE)item.getValue(); + } + + aBufferNodes.clearFast(); + getNodeByName("UV", &aBufferNodes, pLayerElUV); + pProp = &aBufferNodes[0]->aProps[0]; + assert(pProp->cType == 'd'); + + vd.pUV = pProp->data.pd; + vd.uSizeUV = pProp->uArrSize; + + aBufferNodes.clearFast(); + getNodeByName("UVIndex", &aBufferNodes, pLayerElUV); + + vd.pUVIndexes = NULL; + vd.uSizeUVIndexes = NULL; + + if(aBufferNodes.size()) + { + pProp = &aBufferNodes[0]->aProps[0]; + assert(pProp->cType == 'i'); + + vd.pUVIndexes = pProp->data.pi; + vd.uSizeUVIndexes = pProp->uArrSize; + } + + aBufferNodes.clearFast(); + getNodeByName("LayerElementNormal", &aBufferNodes, pGeomNode); + FBXNodeRecord *pLayerNormals = aBufferNodes[0]; + + { + aBufferNodes.clearFast(); + getNodeByName("MappingInformationType", &aBufferNodes, pLayerNormals); + EnumReflector::Enumerator item = EnumReflector::Get<FBX_MAPPING_INFORMATION_TYPE>().find(aBufferNodes[0]->aProps[0].data.pc); + assert(item.isValid()); + 
vd.normalMappingInfoType = (FBX_MAPPING_INFORMATION_TYPE)item.getValue(); + } + + { + aBufferNodes.clearFast(); + getNodeByName("ReferenceInformationType", &aBufferNodes, pLayerNormals); + EnumReflector::Enumerator item = EnumReflector::Get<FBX_REFERENCE_INFORMATION_TYPE>().find(aBufferNodes[0]->aProps[0].data.pc); + assert(item.isValid()); + vd.normalRefInfoType = (FBX_REFERENCE_INFORMATION_TYPE)item.getValue(); + } + + aBufferNodes.clearFast(); + getNodeByName("Normals", &aBufferNodes, pLayerNormals); + pProp = &aBufferNodes[0]->aProps[0]; + assert(pProp->cType == 'd'); + + vd.pNormal = pProp->data.pd; + + aBufferNodes.clearFast(); + getNodeByName("NormalsIndex", &aBufferNodes, pLayerNormals); + + vd.pNormalIndexes = NULL; + + if(aBufferNodes.size()) + { + pProp = &aBufferNodes[0]->aProps[0]; + assert(pProp->cType == 'i'); + + vd.pNormalIndexes = pProp->data.pi; + } + + aBufferNodes.clearFast(); + getNodeByName("LayerElementMaterial", &aBufferNodes, pGeomNode); + FBXNodeRecord *pLayerMaterial = aBufferNodes[0]; + + { + aBufferNodes.clearFast(); + getNodeByName("MappingInformationType", &aBufferNodes, pLayerMaterial); + EnumReflector::Enumerator item = EnumReflector::Get<FBX_MAPPING_INFORMATION_TYPE>().find(aBufferNodes[0]->aProps[0].data.pc); + assert(item.isValid()); + vd.materialMappingInfoType = (FBX_MAPPING_INFORMATION_TYPE)item.getValue(); + } + + { + aBufferNodes.clearFast(); + getNodeByName("ReferenceInformationType", &aBufferNodes, pLayerMaterial); + EnumReflector::Enumerator item = EnumReflector::Get<FBX_REFERENCE_INFORMATION_TYPE>().find(aBufferNodes[0]->aProps[0].data.pc); + assert(item.isValid()); + vd.materialRefInfoType = (FBX_REFERENCE_INFORMATION_TYPE)item.getValue(); + } + + aBufferNodes.clearFast(); + getNodeByName("Materials", &aBufferNodes, pLayerMaterial); + pProp = &aBufferNodes[0]->aProps[0]; + assert(pProp->cType == 'i'); + + vd.pMaterials = pProp->data.pi; + vd.pMaterialMapArray = &aMaterialMap; + + getVertices(vd, &aVerteces, &aIndeces); + } + + if(aVerteces.size()) + { + UINT *s = (UINT*)alloca(aVerteces.size() * sizeof(UINT)); + UINT *s2 = (UINT*)alloca(aVerteces.size() * sizeof(UINT)); + UINT uIndex = 0; + + fora(j, aVerteces) + { + if(aVerteces[j].size() > 0) + { + s[uIndex] = aVerteces[j].size(); + s2[uIndex] = aIndeces[j].size(); + ++uIndex; + } + } + + UINT uLodIndex = pResource->addLod(uIndex, s, s2); + uIndex = 0; + + fora(j, aVerteces) + { + if(aVerteces[j].size() > 0) + { + auto sub = pResource->getSubset(uLodIndex, uIndex); + + sub->iMaterialID = j; + memcpy(sub->pVertices, aVerteces[j], aVerteces[j].size() * sizeof(XResourceModelStaticVertex)); + memcpy(sub->pIndices, aIndeces[j], aIndeces[j].size() * sizeof(UINT)); + + ++uIndex; + } + } + + break; + } + } + + return(true); +} + +bool XMETHODCALLTYPE CModelLoader::loadAsAnimated(IXResourceModelAnimated *pResource) +{ + return(false); +} + +void XMETHODCALLTYPE CModelLoader::close() +{ + m_rootNode.aNodes.clearFast(); + m_rootNode.aProps.clearFast(); + mem_release(m_pCurrentFile); +} + +bool CModelLoader::loadGeneric(IXResourceModel *pResource) +{ + return(false); +} + +bool CModelLoader::readNode(FBXNodeRecord *pNode) +{ + assert(pNode); + + m_pCurrentFile->readBin(pNode->szName, pNode->hdr.u8NameLen); + + pNode->szName[pNode->hdr.u8NameLen] = 0; + + pNode->aProps.reserve(pNode->hdr.uNumProperties); + + for(UINT i = 0; i < pNode->hdr.uNumProperties; ++i) + { + FBXPropertyRecord &prop = pNode->aProps[i]; + m_pCurrentFile->readBin(&prop.cType, sizeof(prop.cType)); + + if(isupper(prop.cType)) + { + 
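+			// Uppercase type codes are scalar properties: 'Y' int16, 'C' bool (1 byte),
+			// 'I' int32, 'F' float, 'D' double, 'L' int64, 'S' length-prefixed string,
+			// 'R' raw byte block. Lowercase codes ('f','d','l','i','b') are arrays and are
+			// handled in the else branch below, with optional zlib/deflate compression.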
switch(prop.cType) + { + case 'Y': + m_pCurrentFile->readBin(&prop.data.i16, sizeof(prop.data.i16)); + break; + case 'C': + m_pCurrentFile->readBin(&prop.data.b, sizeof(prop.data.b)); + break; + case 'I': + m_pCurrentFile->readBin(&prop.data.i, sizeof(prop.data.i)); + break; + case 'F': + m_pCurrentFile->readBin(&prop.data.f, sizeof(prop.data.f)); + break; + case 'D': + m_pCurrentFile->readBin(&prop.data.d, sizeof(prop.data.d)); + break; + case 'L': + m_pCurrentFile->readBin(&prop.data.i64, sizeof(prop.data.i64)); + break; + case 'S': + m_pCurrentFile->readBin(&prop.uArrSize, sizeof(prop.uArrSize)); + prop.data.pc = new char[prop.uArrSize + 1]; + m_pCurrentFile->readBin(prop.data.pc, prop.uArrSize); + prop.data.pc[prop.uArrSize] = 0; + break; + case 'R': + m_pCurrentFile->readBin(&prop.uArrSize, sizeof(prop.uArrSize)); + prop.data.pRaw = new byte[prop.uArrSize]; + m_pCurrentFile->readBin(prop.data.pRaw, prop.uArrSize); + break; + default: + LogError("Unsupported field type value %c\n", prop.cType); + return(false); + } + } + else + { + FBXArrayHeader ahdr = {}; + m_pCurrentFile->readBin(&ahdr, sizeof(ahdr)); + + prop.uArrSize = ahdr.uLength; + + Array<byte> aBytes; + + void *pOut = NULL; + size_t size = 0; + size_t outSize = 0; + + if(ahdr.uEncoding != FBXCT_UNCOMPRESSED && ahdr.uEncoding != FBXCT_DEFLATE) + { + LogError("Unsupported compression type %u\n", ahdr.uEncoding); + return(false); + } + + switch(prop.cType) + { + case 'f': + pOut = prop.data.pf = new float[prop.uArrSize]; + size = sizeof(*prop.data.pf) * prop.uArrSize; + break; + case 'd': + pOut = prop.data.pd = new double[prop.uArrSize]; + size = sizeof(*prop.data.pd) * prop.uArrSize; + break; + case 'l': + pOut = prop.data.pi64 = new int64_t[prop.uArrSize]; + size = sizeof(*prop.data.pi64) * prop.uArrSize; + break; + case 'i': + pOut = prop.data.pi = new int32_t[prop.uArrSize]; + size = sizeof(*prop.data.pi) * prop.uArrSize; + break; + case 'b': + pOut = prop.data.pb = new uint8_t[prop.uArrSize]; + size = sizeof(*prop.data.pb) * prop.uArrSize; + break; + default: + LogError("Unsupported field type value %c\n", prop.cType); + return(false); + } + + if(ahdr.uEncoding == FBXCT_UNCOMPRESSED) + { + m_pCurrentFile->readBin(pOut, size); + } + + if(ahdr.uEncoding == FBXCT_DEFLATE) + { + aBytes.resizeFast(ahdr.uCompressedLength); + m_pCurrentFile->readBin(aBytes, ahdr.uCompressedLength); + + libdeflate_decompressor *pDecompressor = libdeflate_alloc_decompressor(); + // 2 first bytes is a zlib trash header + libdeflate_result res = libdeflate_deflate_decompress(pDecompressor, aBytes + 2, aBytes.size() - 2, pOut, size, &outSize); + libdeflate_free_decompressor(pDecompressor); + + if(res != LIBDEFLATE_SUCCESS) + { + LogError("libdeflate_deflate_decompress(): returned error %d\n", res); + } + } + } + } + + FBXNodeRecordHeader hdr; + + while(m_pCurrentFile->getPos() < pNode->hdr.uEndOffset) + { + if(m_isOldVer) + { + FBXNodeRecordHeaderOld hdrOld; + m_pCurrentFile->readBin(&hdrOld, sizeof(hdrOld)); + + hdr.u8NameLen = hdrOld.u8NameLen; + hdr.uEndOffset = hdrOld.uEndOffset; + hdr.uNumProperties = hdrOld.uNumProperties; + hdr.uPropertyListLen = hdrOld.uPropertyListLen; + } + else + { + m_pCurrentFile->readBin(&hdr, sizeof(hdr)); + } + + if(isNullNode(&hdr)) + { + break; + } + + FBXNodeRecord &rec = pNode->aNodes[pNode->aNodes.size()]; + rec.hdr = hdr; + if(!readNode(&rec)) + { + return(false); + } + } + + return(true); +} + +bool CModelLoader::isNullNode(FBXNodeRecordHeader *pHdr) +{ + return(pHdr->uPropertyListLen == 0 && 
pHdr->uNumProperties == 0 && pHdr->uEndOffset == 0 && pHdr->u8NameLen == 0); +} + +void CModelLoader::printNode(FBXNodeRecord *pNode, UINT uLevel) +{ + for(UINT i = 0; i < uLevel; ++i) + { + printf("\t"); + } + + printf("%s:", pNode->szName); + + fora(i, pNode->aProps) + { + const FBXPropertyRecord &prop = pNode->aProps[i]; + + switch(prop.cType) + { + case 'f': + printf(" array<float>(%u)", prop.uArrSize); + break; + case 'd': + printf(" array<double>(%u)", prop.uArrSize); + break; + case 'l': + printf(" array<int64>(%u)", prop.uArrSize); + break; + case 'i': + printf(" array<int32>(%u)", prop.uArrSize); + break; + case 'b': + printf(" array<uint8>(%u)", prop.uArrSize); + break; + case 'S': + printf(" \"%s\"", prop.data.pc); + break; + case 'R': + printf(" array<byte>(%u)", prop.uArrSize); + break; + case 'Y': + printf(" Y:%hd", prop.data.i16); + break; + case 'C': + printf(" C:%s", (prop.data.b ? "true" : "false")); + break; + case 'I': + printf(" I:%d", prop.data.i); + break; + case 'F': + printf(" F:%f", prop.data.f); + break; + case 'D': + printf(" D:%f", prop.data.d); + break; + case 'L': + printf(" L:%ld", prop.data.i64); + break; + default: + break; + } + } + + printf("\n"); + + fora(i, pNode->aNodes) + { + printNode(&pNode->aNodes[i], uLevel + 1); + } +} + +void CModelLoader::getNodeByName(const char *szName, Array<FBXNodeRecord*> *pArray, FBXNodeRecord *pNode) +{ + if(!pNode) + { + pNode = &m_rootNode; + } + + fora(i, pNode->aNodes) + { + getNodeByName(szName, pArray, &pNode->aNodes[i]); + } + + if(!strcmp(pNode->szName, szName)) + { + pArray->push_back(pNode); + } +} + +FBXNodeRecord *CModelLoader::getChildNode(const char *szName, const FBXNodeRecord *pNode, UINT uIndex) +{ + assert(pNode->aProps[0].cType == 'L'); + + int64_t id = pNode->aProps[0].data.i64; + + Array<FBXNodeRecord*> array; + + getNodeByName(szName, &array); + + if(!array.size()) + { + return(NULL); + } + + fora(i, m_rootNode.aNodes) + { + if(!strcmp(m_rootNode.aNodes[i].szName, "Connections")) + { + fora(j, m_rootNode.aNodes[i].aNodes) + { + FBXNodeRecord &node = m_rootNode.aNodes[i].aNodes[j]; + + assert(node.aProps[2].cType == 'L'); + + if(node.aProps[2].data.i64 == id) + { + fora(k, array) + { + assert(node.aProps[1].cType == 'L'); + assert(array[k]->aProps[0].cType == 'L'); + + if(array[k]->aProps[0].data.i64 == node.aProps[1].data.i64) + { + if(uIndex-- == 0) + { + return(array[k]); + } + } + } + } + } + } + } + return(NULL); +} + +UINT CModelLoader::getCountIndexes(int32_t *pIndexes, uint32_t uSize) +{ + UINT uResult = 0; + int32_t *pIterator = pIndexes; + + Array<int32_t> array; + + while(pIterator < pIndexes + uSize) + { + array.push_back(*pIterator); + + if(*pIterator < 0) + { + array[array.size() - 1] = -(*pIterator) - 1; + + //assert(array.size() == 3 || array.size() == 4); + + fora(i, array) + { + if(array[i] == array[(i + 1) % il]) + { + array.erase(i); + break; + } + } + + uResult += (array.size() - 2) * 3; + + array.clearFast(); + } + ++pIterator; + } + return(uResult); +} + +void CModelLoader::getVertices(const VerticesData &verticesData, Array<Array<XResourceModelStaticVertex>> *pOutArray, Array<Array<uint32_t>> *pOutIndices) +{ + struct Index + { + int32_t iValue; + UINT uIndex; + }; + + XResourceModelStaticVertex vtx = {}; + const int32_t *pIndexes = verticesData.pIndexes; + const uint32_t uSize = verticesData.uSizeIndexes; + const int32_t *pIterator = pIndexes; + const double *pVerteces = verticesData.pVerteces; + + uint32_t uStartVertex = 0; + + Array<Index> array; + + Array<UINT> uUVIndex; 
+ Array<UINT> uNormalsIndex; + UINT uPolyCounter = 0; + UINT uMaterialIndex = 0; + + SMMATRIX matrix; + + if(verticesData.pMatrixNode) + { + double *m = verticesData.pMatrixNode->aProps[0].data.pd; + + matrix = SMMATRIX(m[0], m[1], m[2], m[3], + m[4], m[5], m[6], m[7], + m[8], m[9], m[10], m[11], + m[12], m[13], m[14], m[15]); + + float fDet = SMMatrixDeterminant(matrix); + if(fDet < 0.0f) + { + matrix.r[0] = -matrix.r[0]; + } + } + + matrix = matrix * verticesData.qPreRotation.Conjugate().GetMatrix(); + + while(pIterator < pIndexes + uSize) + { + array.push_back({*pIterator, pIterator - pIndexes}); + + if(*pIterator < 0) + { + array[array.size() - 1].iValue = -(*pIterator) - 1; + + fora(i, array) + { + if(array[i].iValue == array[(i + 1) % il].iValue) + { + array.erase(i); + break; + } + } + + fora(i, array) + { + switch(verticesData.uvMappingInfoType) + { + case FBX_ByPolygonVertex: + uUVIndex[i] = array[i].uIndex; + break; + case FBX_ByVertex: + uUVIndex[i] = array[i].iValue; + break; + default: + assert(!"Unsupported maping information type"); + break; + } + + switch(verticesData.uvRefInfoType) + { + case FBX_Direct: + // No need actions + break; + case FBX_IndexToDirect: + uUVIndex[i] = verticesData.pUVIndexes[uUVIndex[i]]; + break; + + default: + assert(!"Unsupported reference information type"); + break; + } + + switch(verticesData.normalMappingInfoType) + { + case FBX_ByPolygon: + uNormalsIndex[i] = uPolyCounter; + break; + case FBX_ByPolygonVertex: + uNormalsIndex[i] = array[i].uIndex; + break; + case FBX_ByVertex: + uNormalsIndex[i] = array[i].iValue; + break; + default: + assert(!"Unsupported maping information type"); + break; + } + + switch(verticesData.normalRefInfoType) + { + case FBX_Direct: + // No need actions + break; + case FBX_IndexToDirect: + uNormalsIndex[i] = verticesData.pNormalIndexes[uNormalsIndex[i]]; + break; + + default: + assert(!"Unsupported reference information type"); + break; + } + } + + switch(verticesData.materialMappingInfoType) + { + case FBX_ByPolygon: + uMaterialIndex = uPolyCounter; + break; + case FBX_AllSame: + uMaterialIndex = 0; + break; + default: + assert(!"Unsupported maping information type"); + break; + } + + switch(verticesData.materialRefInfoType) + { + case FBX_Direct: + // No need actions + break; + case FBX_IndexToDirect: + uMaterialIndex = verticesData.pMaterials[uMaterialIndex]; + break; + + default: + assert(!"Unsupported reference information type"); + break; + } + + uMaterialIndex = (*verticesData.pMaterialMapArray)[uMaterialIndex]; + + uStartVertex = (*pOutArray)[uMaterialIndex].size(); + fora(i, array) + { + vtx.vPos = matrix * float3_t(pVerteces[array[i].iValue * 3], pVerteces[array[i].iValue * 3 + 1], pVerteces[array[i].iValue * 3 + 2]) / 100.0f * m_fUnitScaleFactor; + vtx.vTex = float2_t(verticesData.pUV[uUVIndex[i] * 2], 1.0f - verticesData.pUV[uUVIndex[i] * 2 + 1]); + vtx.vNorm = float3_t(verticesData.pNormal[uNormalsIndex[i] * 3], verticesData.pNormal[uNormalsIndex[i] * 3 + 1], verticesData.pNormal[uNormalsIndex[i] * 3 + 2]); + (*pOutArray)[uMaterialIndex].push_back(vtx); + } + + // (0-5) 012 023 034 045 ... 
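+			// Fan-triangulate the n-gon: with local corner indices 0..n-1, emit triangles
+			// (0,1,2), (0,2,3), ..., (0,n-2,n-1), i.e. (n-2) triangles per polygon,
+			// offset by uStartVertex into the subset's vertex array.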
+ for(UINT i = 1, l = array.size() - 1; i < l; ++i) + { + (*pOutIndices)[uMaterialIndex].push_back(uStartVertex); + (*pOutIndices)[uMaterialIndex].push_back(uStartVertex + i); + (*pOutIndices)[uMaterialIndex].push_back(uStartVertex + i + 1); + } + + array.clearFast(); + ++uPolyCounter; + } + ++pIterator; + } +} \ No newline at end of file diff --git a/source/fbxplugin/ModelLoader.h b/source/fbxplugin/ModelLoader.h new file mode 100644 index 0000000000000000000000000000000000000000..d8a81986c336b7501608775e923852a87f5ad248 --- /dev/null +++ b/source/fbxplugin/ModelLoader.h @@ -0,0 +1,88 @@ +#ifndef __MODELLOADER_H +#define __MODELLOADER_H + +#include <xcommon/IXModelLoader.h> +#include <xcommon/IFileSystem.h> +#include "fbx.h" + +class CModelLoader final: public IXUnknownImplementation<IXModelLoader> +{ +public: + CModelLoader(IFileSystem *pFileSystem); + XIMPLEMENT_VERSION(IXMODELLOADER_VERSION); + + UINT XMETHODCALLTYPE getExtCount() const override; + const char* XMETHODCALLTYPE getExt(UINT uIndex) const override; + const char* XMETHODCALLTYPE getExtText(UINT uIndex) const override; + const char* XMETHODCALLTYPE getAuthor() const override; + const char* XMETHODCALLTYPE getCopyright() const override; + const char* XMETHODCALLTYPE getDescription() const override; + + bool XMETHODCALLTYPE open(const char *szFileName) override; + XMODELTYPE XMETHODCALLTYPE getType() const override; + bool XMETHODCALLTYPE loadAsStatic(IXResourceModelStatic *pResource) override; + bool XMETHODCALLTYPE loadAsAnimated(IXResourceModelAnimated *pResource) override; + void XMETHODCALLTYPE getInfo(XModelInfo *pModelInfo) override; + void XMETHODCALLTYPE close() override; + + + bool loadGeneric(IXResourceModel *pResource); + +private: + bool readNode(FBXNodeRecord *pNode); + bool isNullNode(FBXNodeRecordHeader *pHdr); + void printNode(FBXNodeRecord *pNode, UINT uLevel); + void getNodeByName(const char *szName, Array<FBXNodeRecord*> *pArray, FBXNodeRecord *pNode = NULL); + + FBXNodeRecord *getChildNode(const char *szName, const FBXNodeRecord *pNode, UINT uIndex = 0); + UINT getCountIndexes(int32_t *pIndexes, uint32_t uSize); + + struct VerticesData + { + int32_t *pIndexes; + uint32_t uSizeIndexes; + + double *pVerteces; + + FBX_MAPPING_INFORMATION_TYPE uvMappingInfoType; + FBX_REFERENCE_INFORMATION_TYPE uvRefInfoType; + + double *pUV; + uint32_t uSizeUV; + + int32_t *pUVIndexes; + uint32_t uSizeUVIndexes; + + double *pNormal; + int32_t *pNormalIndexes; + + FBX_MAPPING_INFORMATION_TYPE normalMappingInfoType; + FBX_REFERENCE_INFORMATION_TYPE normalRefInfoType; + + int32_t *pMaterials; + + FBX_MAPPING_INFORMATION_TYPE materialMappingInfoType; + FBX_REFERENCE_INFORMATION_TYPE materialRefInfoType; + + const Array<UINT> *pMaterialMapArray; + + FBXNodeRecord *pMatrixNode; + + SMQuaternion qPreRotation; + }; + + void getVertices(const VerticesData &verticesData, Array<Array<XResourceModelStaticVertex>> *pOutArray, Array<Array<uint32_t>> *pOutIndices); + + IFileSystem *m_pFileSystem; + IFile *m_pCurrentFile = NULL; + + float m_fUnitScaleFactor = 1.0f; + + String m_sFileName; + + bool m_isOldVer = false; + + FBXNodeRecord m_rootNode; +}; + +#endif diff --git a/source/fbxplugin/dllmain.cpp b/source/fbxplugin/dllmain.cpp new file mode 100644 index 0000000000000000000000000000000000000000..955d268b01c7c34e230e19cda81db4babeee39ba --- /dev/null +++ b/source/fbxplugin/dllmain.cpp @@ -0,0 +1,24 @@ + +/*********************************************************** +Copyright © Ivan Dokunov, Evgeny Danilovich, 2025 +See the license in 
LICENSE +***********************************************************/ + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +BOOL APIENTRY DllMain(HMODULE hModule, + DWORD ul_reason_for_call, + LPVOID lpReserved + ) +{ + switch(ul_reason_for_call) + { + case DLL_PROCESS_ATTACH: + case DLL_THREAD_ATTACH: + case DLL_THREAD_DETACH: + case DLL_PROCESS_DETACH: + break; + } + return TRUE; +} diff --git a/source/fbxplugin/fbx.h b/source/fbxplugin/fbx.h new file mode 100644 index 0000000000000000000000000000000000000000..cadf5911665560dc6408ac38cb6f46d3f4afb2df --- /dev/null +++ b/source/fbxplugin/fbx.h @@ -0,0 +1,126 @@ +#ifndef __FBX_H +#define __FBX_H + +#include <gdefines.h> + +#define FBX_MAGIC "Kaydara FBX Binary " +#define FBX_VERSION 7500 + +enum FBX_COMPRESSION_TYPE +{ + FBXCT_UNCOMPRESSED = 0, + FBXCT_DEFLATE = 1 +}; + +XENUM(FBX_MAPPING_INFORMATION_TYPE, + FBX_ByPolygon, + FBX_ByPolygonVertex, + FBX_ByVertex, + FBX_ByVertice = FBX_ByVertex, + FBX_ByEdge, + FBX_AllSame +); + +XENUM(FBX_REFERENCE_INFORMATION_TYPE, + FBX_Direct, + FBX_IndexToDirect, + FBX_Index = FBX_IndexToDirect +); + +#pragma pack(push, 1) + +struct FBXHeader +{ + char szMagic[21]; + uint16_t _unknown; + uint32_t uVersion; +}; + +struct FBXNodeRecordHeaderOld +{ + uint32_t uEndOffset; + uint32_t uNumProperties; + uint32_t uPropertyListLen; + uint8_t u8NameLen; +}; + +struct FBXNodeRecordHeader +{ + uint64_t uEndOffset; + uint64_t uNumProperties; + uint64_t uPropertyListLen; + uint8_t u8NameLen; +}; + +struct FBXPropertyRecord +{ + char cType; + union + { + int16_t i16; + uint8_t b; + int32_t i; + float f; + double d; + int64_t i64; + + float *pf; + double *pd; + int64_t *pi64; + int32_t *pi; + uint8_t *pb; + + char *pc; + byte *pRaw; + } data; + uint32_t uArrSize; + + ~FBXPropertyRecord() + { + switch(cType) + { + case 'f': + mem_delete_a(data.pf); + break; + case 'd': + mem_delete_a(data.pd); + break; + case 'l': + mem_delete_a(data.pi64); + break; + case 'i': + mem_delete_a(data.pi); + break; + case 'b': + mem_delete_a(data.pb); + break; + case 'S': + mem_delete_a(data.pc); + break; + case 'R': + mem_delete_a(data.pRaw); + break; + default: + break; + } + } +}; + +struct FBXArrayHeader +{ + uint32_t uLength; + uint32_t uEncoding; + uint32_t uCompressedLength; +}; + +struct FBXNodeRecord +{ + FBXNodeRecordHeader hdr; + char szName[256]; + Array<FBXPropertyRecord> aProps; + Array<FBXNodeRecord> aNodes; +}; + +#pragma pack(pop) + +#endif diff --git a/source/fbxplugin/libdeflate.c b/source/fbxplugin/libdeflate.c new file mode 100644 index 0000000000000000000000000000000000000000..e421d7911345c57cee506a375d2950a1d552643c --- /dev/null +++ b/source/fbxplugin/libdeflate.c @@ -0,0 +1,4193 @@ +// ofbx changes : removed unused code, single .h and .c +/* + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * --------------------------------------------------------------------------- + * + * This is a highly optimized DEFLATE decompressor. It is much faster than + * vanilla zlib, typically well over twice as fast, though results vary by CPU. + * + * Why this is faster than vanilla zlib: + * + * - Word accesses rather than byte accesses when reading input + * - Word accesses rather than byte accesses when copying matches + * - Faster Huffman decoding combined with various DEFLATE-specific tricks + * - Larger bitbuffer variable that doesn't need to be refilled as often + * - Other optimizations to remove unnecessary branches + * - Only full-buffer decompression is supported, so the code doesn't need to + * support stopping and resuming decompression. + * - On x86_64, a version of the decompression routine is compiled with BMI2 + * instructions enabled and is used automatically at runtime when supported. + */ + +/* + * lib_common.h - internal header included by all library code + */ + +#ifndef LIB_LIB_COMMON_H +#define LIB_LIB_COMMON_H + +#ifdef LIBDEFLATE_H + /* + * When building the library, LIBDEFLATEAPI needs to be defined properly before + * including libdeflate.h. + */ +# error "lib_common.h must always be included before libdeflate.h" +#endif + +#if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__)) +# define LIBDEFLATE_EXPORT_SYM __declspec(dllexport) +#elif defined(__GNUC__) +# define LIBDEFLATE_EXPORT_SYM __attribute__((visibility("default"))) +#else +# define LIBDEFLATE_EXPORT_SYM +#endif + +/* + * On i386, gcc assumes that the stack is 16-byte aligned at function entry. + * However, some compilers (e.g. MSVC) and programming languages (e.g. Delphi) + * only guarantee 4-byte alignment when calling functions. This is mainly an + * issue on Windows, but it has been seen on Linux too. Work around this ABI + * incompatibility by realigning the stack pointer when entering libdeflate. + * This prevents crashes in SSE/AVX code. + */ +#if defined(__GNUC__) && defined(__i386__) +# define LIBDEFLATE_ALIGN_STACK __attribute__((force_align_arg_pointer)) +#else +# define LIBDEFLATE_ALIGN_STACK +#endif + +#define LIBDEFLATEAPI LIBDEFLATE_EXPORT_SYM LIBDEFLATE_ALIGN_STACK + +/* + * common_defs.h + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef COMMON_DEFS_H +#define COMMON_DEFS_H + +#include "libdeflate.h" + +#include <stdbool.h> +#include <stddef.h> /* for size_t */ +#include <stdint.h> +#ifdef _MSC_VER +# include <intrin.h> /* for _BitScan*() and other intrinsics */ +# include <stdlib.h> /* for _byteswap_*() */ + /* Disable MSVC warnings that are expected. */ + /* /W2 */ +# pragma warning(disable : 4146) /* unary minus on unsigned type */ + /* /W3 */ +# pragma warning(disable : 4018) /* signed/unsigned mismatch */ +# pragma warning(disable : 4244) /* possible loss of data */ +# pragma warning(disable : 4267) /* possible loss of precision */ +# pragma warning(disable : 4310) /* cast truncates constant value */ + /* /W4 */ +# pragma warning(disable : 4100) /* unreferenced formal parameter */ +# pragma warning(disable : 4127) /* conditional expression is constant */ +# pragma warning(disable : 4189) /* local variable initialized but not referenced */ +# pragma warning(disable : 4232) /* nonstandard extension used */ +# pragma warning(disable : 4245) /* conversion from 'int' to 'unsigned int' */ +# pragma warning(disable : 4295) /* array too small to include terminating null */ +#endif +#ifndef FREESTANDING +# include <string.h> /* for memcpy() */ +#endif + +/* ========================================================================== */ +/* Target architecture */ +/* ========================================================================== */ + +/* If possible, define a compiler-independent ARCH_* macro. */ +#undef ARCH_X86_64 +#undef ARCH_X86_32 +#undef ARCH_ARM64 +#undef ARCH_ARM32 +#ifdef _MSC_VER +# if defined(_M_X64) +# define ARCH_X86_64 +# elif defined(_M_IX86) +# define ARCH_X86_32 +# elif defined(_M_ARM64) +# define ARCH_ARM64 +# elif defined(_M_ARM) +# define ARCH_ARM32 +# endif +#else +# if defined(__x86_64__) +# define ARCH_X86_64 +# elif defined(__i386__) +# define ARCH_X86_32 +# elif defined(__aarch64__) +# define ARCH_ARM64 +# elif defined(__arm__) +# define ARCH_ARM32 +# endif +#endif + +/* ========================================================================== */ +/* Type definitions */ +/* ========================================================================== */ + +/* Fixed-width integer types */ +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +/* ssize_t, if not available in <sys/types.h> */ +#ifdef _MSC_VER +# ifdef _WIN64 + typedef long long ssize_t; +# else + typedef long ssize_t; +# endif +#endif + +/* + * Word type of the target architecture. Use 'size_t' instead of + * 'unsigned long' to account for platforms such as Windows that use 32-bit + * 'unsigned long' on 64-bit architectures. 
+ */ +typedef size_t machine_word_t; + +/* Number of bytes in a word */ +#define WORDBYTES ((int)sizeof(machine_word_t)) + +/* Number of bits in a word */ +#define WORDBITS (8 * WORDBYTES) + +/* ========================================================================== */ +/* Optional compiler features */ +/* ========================================================================== */ + +/* Compiler version checks. Only use when absolutely necessary. */ +#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) +# define GCC_PREREQ(major, minor) \ + (__GNUC__ > (major) || \ + (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) +#else +# define GCC_PREREQ(major, minor) 0 +#endif +#ifdef __clang__ +# ifdef __apple_build_version__ +# define CLANG_PREREQ(major, minor, apple_version) \ + (__apple_build_version__ >= (apple_version)) +# else +# define CLANG_PREREQ(major, minor, apple_version) \ + (__clang_major__ > (major) || \ + (__clang_major__ == (major) && __clang_minor__ >= (minor))) +# endif +#else +# define CLANG_PREREQ(major, minor, apple_version) 0 +#endif + +/* + * Macros to check for compiler support for attributes and builtins. clang + * implements these macros, but gcc doesn't, so generally any use of one of + * these macros must also be combined with a gcc version check. + */ +#ifndef __has_attribute +# define __has_attribute(attribute) 0 +#endif +#ifndef __has_builtin +# define __has_builtin(builtin) 0 +#endif + +/* inline - suggest that a function be inlined */ +#ifdef _MSC_VER +# define inline __inline +#endif /* else assume 'inline' is usable as-is */ + +/* forceinline - force a function to be inlined, if possible */ +#if defined(__GNUC__) || __has_attribute(always_inline) +# define forceinline inline __attribute__((always_inline)) +#elif defined(_MSC_VER) +# define forceinline __forceinline +#else +# define forceinline inline +#endif + +/* MAYBE_UNUSED - mark a function or variable as maybe unused */ +#if defined(__GNUC__) || __has_attribute(unused) +# define MAYBE_UNUSED __attribute__((unused)) +#else +# define MAYBE_UNUSED +#endif + +/* + * restrict - hint that writes only occur through the given pointer. + * + * Don't use MSVC's __restrict, since it has nonstandard behavior. + * Standard restrict is okay, if it is supported. 
+ */ +#if !defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L) +# if defined(__GNUC__) || defined(__clang__) +# define restrict __restrict__ +# else +# define restrict +# endif +#endif /* else assume 'restrict' is usable as-is */ + +/* likely(expr) - hint that an expression is usually true */ +#if defined(__GNUC__) || __has_builtin(__builtin_expect) +# define likely(expr) __builtin_expect(!!(expr), 1) +#else +# define likely(expr) (expr) +#endif + +/* unlikely(expr) - hint that an expression is usually false */ +#if defined(__GNUC__) || __has_builtin(__builtin_expect) +# define unlikely(expr) __builtin_expect(!!(expr), 0) +#else +# define unlikely(expr) (expr) +#endif + +/* prefetchr(addr) - prefetch into L1 cache for read */ +#undef prefetchr +#if defined(__GNUC__) || __has_builtin(__builtin_prefetch) +# define prefetchr(addr) __builtin_prefetch((addr), 0) +#elif defined(_MSC_VER) +# if defined(ARCH_X86_32) || defined(ARCH_X86_64) +# define prefetchr(addr) _mm_prefetch((addr), _MM_HINT_T0) +# elif defined(ARCH_ARM64) +# define prefetchr(addr) __prefetch2((addr), 0x00 /* prfop=PLDL1KEEP */) +# elif defined(ARCH_ARM32) +# define prefetchr(addr) __prefetch(addr) +# endif +#endif +#ifndef prefetchr +# define prefetchr(addr) +#endif + +/* prefetchw(addr) - prefetch into L1 cache for write */ +#undef prefetchw +#if defined(__GNUC__) || __has_builtin(__builtin_prefetch) +# define prefetchw(addr) __builtin_prefetch((addr), 1) +#elif defined(_MSC_VER) +# if defined(ARCH_X86_32) || defined(ARCH_X86_64) +# define prefetchw(addr) _m_prefetchw(addr) +# elif defined(ARCH_ARM64) +# define prefetchw(addr) __prefetch2((addr), 0x10 /* prfop=PSTL1KEEP */) +# elif defined(ARCH_ARM32) +# define prefetchw(addr) __prefetchw(addr) +# endif +#endif +#ifndef prefetchw +# define prefetchw(addr) +#endif + +/* + * _aligned_attribute(n) - declare that the annotated variable, or variables of + * the annotated type, must be aligned on n-byte boundaries. + */ +#undef _aligned_attribute +#if defined(__GNUC__) || __has_attribute(aligned) +# define _aligned_attribute(n) __attribute__((aligned(n))) +#elif defined(_MSC_VER) +# define _aligned_attribute(n) __declspec(align(n)) +#endif + +/* + * _target_attribute(attrs) - override the compilation target for a function. + * + * This accepts one or more comma-separated suffixes to the -m prefix jointly + * forming the name of a machine-dependent option. On gcc-like compilers, this + * enables codegen for the given targets, including arbitrary compiler-generated + * code as well as the corresponding intrinsics. On other compilers this macro + * expands to nothing, though MSVC allows intrinsics to be used anywhere anyway. + */ +#if GCC_PREREQ(4, 4) || __has_attribute(target) +# define _target_attribute(attrs) __attribute__((target(attrs))) +# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 1 +#else +# define _target_attribute(attrs) +# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0 +#endif + +/* ========================================================================== */ +/* Miscellaneous macros */ +/* ========================================================================== */ + +#define ARRAY_LEN(A) (sizeof(A) / sizeof((A)[0])) +#define MIN(a, b) ((a) <= (b) ? (a) : (b)) +#define MAX(a, b) ((a) >= (b) ? 
(a) : (b)) +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)])) +#define ALIGN(n, a) (((n) + (a) - 1) & ~((a) - 1)) +#define ROUND_UP(n, d) ((d) * DIV_ROUND_UP((n), (d))) + +/* ========================================================================== */ +/* Endianness handling */ +/* ========================================================================== */ + +/* + * CPU_IS_LITTLE_ENDIAN() - 1 if the CPU is little endian, or 0 if it is big + * endian. When possible this is a compile-time macro that can be used in + * preprocessor conditionals. As a fallback, a generic method is used that + * can't be used in preprocessor conditionals but should still be optimized out. + */ +#if defined(__BYTE_ORDER__) /* gcc v4.6+ and clang */ +# define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#elif defined(_MSC_VER) +# define CPU_IS_LITTLE_ENDIAN() true +#else +static forceinline bool CPU_IS_LITTLE_ENDIAN(void) +{ + union { + u32 w; + u8 b; + } u; + + u.w = 1; + return u.b; +} +#endif + +/* bswap16(v) - swap the bytes of a 16-bit integer */ +static forceinline u16 bswap16(u16 v) +{ +#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) + return __builtin_bswap16(v); +#elif defined(_MSC_VER) + return _byteswap_ushort(v); +#else + return (v << 8) | (v >> 8); +#endif +} + +/* bswap32(v) - swap the bytes of a 32-bit integer */ +static forceinline u32 bswap32(u32 v) +{ +#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32) + return __builtin_bswap32(v); +#elif defined(_MSC_VER) + return _byteswap_ulong(v); +#else + return ((v & 0x000000FF) << 24) | + ((v & 0x0000FF00) << 8) | + ((v & 0x00FF0000) >> 8) | + ((v & 0xFF000000) >> 24); +#endif +} + +/* bswap64(v) - swap the bytes of a 64-bit integer */ +static forceinline u64 bswap64(u64 v) +{ +#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64) + return __builtin_bswap64(v); +#elif defined(_MSC_VER) + return _byteswap_uint64(v); +#else + return ((v & 0x00000000000000FF) << 56) | + ((v & 0x000000000000FF00) << 40) | + ((v & 0x0000000000FF0000) << 24) | + ((v & 0x00000000FF000000) << 8) | + ((v & 0x000000FF00000000) >> 8) | + ((v & 0x0000FF0000000000) >> 24) | + ((v & 0x00FF000000000000) >> 40) | + ((v & 0xFF00000000000000) >> 56); +#endif +} + +#define le16_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap16(v)) +#define le32_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap32(v)) +#define le64_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap64(v)) +#define be16_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap16(v) : (v)) +#define be32_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap32(v) : (v)) +#define be64_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap64(v) : (v)) + +/* ========================================================================== */ +/* Unaligned memory accesses */ +/* ========================================================================== */ + +/* + * UNALIGNED_ACCESS_IS_FAST() - 1 if unaligned memory accesses can be performed + * efficiently on the target platform, otherwise 0. + */ +#if (defined(__GNUC__) || defined(__clang__)) && \ + (defined(ARCH_X86_64) || defined(ARCH_X86_32) || \ + defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) || \ + /* + * For all compilation purposes, WebAssembly behaves like any other CPU + * instruction set. 
Even though WebAssembly engine might be running on + * top of different actual CPU architectures, the WebAssembly spec + * itself permits unaligned access and it will be fast on most of those + * platforms, and simulated at the engine level on others, so it's + * worth treating it as a CPU architecture with fast unaligned access. + */ defined(__wasm__)) +# define UNALIGNED_ACCESS_IS_FAST 1 +#elif defined(_MSC_VER) +# define UNALIGNED_ACCESS_IS_FAST 1 +#else +# define UNALIGNED_ACCESS_IS_FAST 0 +#endif + +/* + * Implementing unaligned memory accesses using memcpy() is portable, and it + * usually gets optimized appropriately by modern compilers. I.e., each + * memcpy() of 1, 2, 4, or WORDBYTES bytes gets compiled to a load or store + * instruction, not to an actual function call. + * + * We no longer use the "packed struct" approach to unaligned accesses, as that + * is nonstandard, has unclear semantics, and doesn't receive enough testing + * (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94994). + * + * arm32 with __ARM_FEATURE_UNALIGNED in gcc 5 and earlier is a known exception + * where memcpy() generates inefficient code + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67366). However, we no longer + * consider that one case important enough to maintain different code for. + * If you run into it, please just use a newer version of gcc (or use clang). + */ + +#ifdef FREESTANDING +# define MEMCOPY __builtin_memcpy +#else +# define MEMCOPY memcpy +#endif + +/* Unaligned loads and stores without endianness conversion */ + +#define DEFINE_UNALIGNED_TYPE(type) \ +static forceinline type \ +load_##type##_unaligned(const void *p) \ +{ \ + type v; \ + \ + MEMCOPY(&v, p, sizeof(v)); \ + return v; \ +} \ + \ +static forceinline void \ +store_##type##_unaligned(type v, void *p) \ +{ \ + MEMCOPY(p, &v, sizeof(v)); \ +} + +DEFINE_UNALIGNED_TYPE(u16) +DEFINE_UNALIGNED_TYPE(u32) +DEFINE_UNALIGNED_TYPE(u64) +DEFINE_UNALIGNED_TYPE(machine_word_t) + +#undef MEMCOPY + +#define load_word_unaligned load_machine_word_t_unaligned +#define store_word_unaligned store_machine_word_t_unaligned + +/* Unaligned loads with endianness conversion */ + +static forceinline u16 +get_unaligned_le16(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le16_bswap(load_u16_unaligned(p)); + else + return ((u16)p[1] << 8) | p[0]; +} + +static forceinline u16 +get_unaligned_be16(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return be16_bswap(load_u16_unaligned(p)); + else + return ((u16)p[0] << 8) | p[1]; +} + +static forceinline u32 +get_unaligned_le32(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le32_bswap(load_u32_unaligned(p)); + else + return ((u32)p[3] << 24) | ((u32)p[2] << 16) | + ((u32)p[1] << 8) | p[0]; +} + +static forceinline u32 +get_unaligned_be32(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return be32_bswap(load_u32_unaligned(p)); + else + return ((u32)p[0] << 24) | ((u32)p[1] << 16) | + ((u32)p[2] << 8) | p[3]; +} + +static forceinline u64 +get_unaligned_le64(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le64_bswap(load_u64_unaligned(p)); + else + return ((u64)p[7] << 56) | ((u64)p[6] << 48) | + ((u64)p[5] << 40) | ((u64)p[4] << 32) | + ((u64)p[3] << 24) | ((u64)p[2] << 16) | + ((u64)p[1] << 8) | p[0]; +} + +static forceinline machine_word_t +get_unaligned_leword(const u8 *p) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + return get_unaligned_le32(p); + else + return get_unaligned_le64(p); +} + +/* Unaligned stores with endianness conversion 
*/ + +static forceinline void +put_unaligned_le16(u16 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u16_unaligned(le16_bswap(v), p); + } else { + p[0] = (u8)(v >> 0); + p[1] = (u8)(v >> 8); + } +} + +static forceinline void +put_unaligned_be16(u16 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u16_unaligned(be16_bswap(v), p); + } else { + p[0] = (u8)(v >> 8); + p[1] = (u8)(v >> 0); + } +} + +static forceinline void +put_unaligned_le32(u32 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u32_unaligned(le32_bswap(v), p); + } else { + p[0] = (u8)(v >> 0); + p[1] = (u8)(v >> 8); + p[2] = (u8)(v >> 16); + p[3] = (u8)(v >> 24); + } +} + +static forceinline void +put_unaligned_be32(u32 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u32_unaligned(be32_bswap(v), p); + } else { + p[0] = (u8)(v >> 24); + p[1] = (u8)(v >> 16); + p[2] = (u8)(v >> 8); + p[3] = (u8)(v >> 0); + } +} + +static forceinline void +put_unaligned_le64(u64 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u64_unaligned(le64_bswap(v), p); + } else { + p[0] = (u8)(v >> 0); + p[1] = (u8)(v >> 8); + p[2] = (u8)(v >> 16); + p[3] = (u8)(v >> 24); + p[4] = (u8)(v >> 32); + p[5] = (u8)(v >> 40); + p[6] = (u8)(v >> 48); + p[7] = (u8)(v >> 56); + } +} + +static forceinline void +put_unaligned_leword(machine_word_t v, u8 *p) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + put_unaligned_le32(v, p); + else + put_unaligned_le64(v, p); +} + +/* ========================================================================== */ +/* Bit manipulation functions */ +/* ========================================================================== */ + +/* + * Bit Scan Reverse (BSR) - find the 0-based index (relative to the least + * significant end) of the *most* significant 1 bit in the input value. The + * input value must be nonzero! + */ + +static forceinline unsigned +bsr32(u32 v) +{ +#if defined(__GNUC__) || __has_builtin(__builtin_clz) + return 31 - __builtin_clz(v); +#elif defined(_MSC_VER) + unsigned long i; + + _BitScanReverse(&i, v); + return i; +#else + unsigned i = 0; + + while ((v >>= 1) != 0) + i++; + return i; +#endif +} + +static forceinline unsigned +bsr64(u64 v) +{ +#if defined(__GNUC__) || __has_builtin(__builtin_clzll) + return 63 - __builtin_clzll(v); +#elif defined(_MSC_VER) && defined(_WIN64) + unsigned long i; + + _BitScanReverse64(&i, v); + return i; +#else + unsigned i = 0; + + while ((v >>= 1) != 0) + i++; + return i; +#endif +} + +static forceinline unsigned +bsrw(machine_word_t v) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + return bsr32(v); + else + return bsr64(v); +} + +/* + * Bit Scan Forward (BSF) - find the 0-based index (relative to the least + * significant end) of the *least* significant 1 bit in the input value. The + * input value must be nonzero! 
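+ *
+ * For example, bsf32(0x00000048) == 3, since bit 3 is the lowest set bit
+ * (whereas bsr32(0x00000048) above returns 6, the highest set bit).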
+ */ + +static forceinline unsigned +bsf32(u32 v) +{ +#if defined(__GNUC__) || __has_builtin(__builtin_ctz) + return __builtin_ctz(v); +#elif defined(_MSC_VER) + unsigned long i; + + _BitScanForward(&i, v); + return i; +#else + unsigned i = 0; + + for (; (v & 1) == 0; v >>= 1) + i++; + return i; +#endif +} + +static forceinline unsigned +bsf64(u64 v) +{ +#if defined(__GNUC__) || __has_builtin(__builtin_ctzll) + return __builtin_ctzll(v); +#elif defined(_MSC_VER) && defined(_WIN64) + unsigned long i; + + _BitScanForward64(&i, v); + return i; +#else + unsigned i = 0; + + for (; (v & 1) == 0; v >>= 1) + i++; + return i; +#endif +} + +static forceinline unsigned +bsfw(machine_word_t v) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + return bsf32(v); + else + return bsf64(v); +} + +/* + * rbit32(v): reverse the bits in a 32-bit integer. This doesn't have a + * fallback implementation; use '#ifdef rbit32' to check if this is available. + */ +#undef rbit32 +#if (defined(__GNUC__) || defined(__clang__)) && defined(ARCH_ARM32) && \ + (__ARM_ARCH >= 7 || (__ARM_ARCH == 6 && defined(__ARM_ARCH_6T2__))) +static forceinline u32 +rbit32(u32 v) +{ + __asm__("rbit %0, %1" : "=r" (v) : "r" (v)); + return v; +} +#define rbit32 rbit32 +#elif (defined(__GNUC__) || defined(__clang__)) && defined(ARCH_ARM64) +static forceinline u32 +rbit32(u32 v) +{ + __asm__("rbit %w0, %w1" : "=r" (v) : "r" (v)); + return v; +} +#define rbit32 rbit32 +#endif + +#endif /* COMMON_DEFS_H */ + + +typedef void *(*malloc_func_t)(size_t); +typedef void (*free_func_t)(void *); + +extern malloc_func_t libdeflate_default_malloc_func; +extern free_func_t libdeflate_default_free_func; + +void *libdeflate_aligned_malloc(malloc_func_t malloc_func, + size_t alignment, size_t size); +void libdeflate_aligned_free(free_func_t free_func, void *ptr); + +#ifdef FREESTANDING +/* + * With -ffreestanding, <string.h> may be missing, and we must provide + * implementations of memset(), memcpy(), memmove(), and memcmp(). + * See https://gcc.gnu.org/onlinedocs/gcc/Standards.html + * + * Also, -ffreestanding disables interpreting calls to these functions as + * built-ins. E.g., calling memcpy(&v, p, WORDBYTES) will make a function call, + * not be optimized to a single load instruction. For performance reasons we + * don't want that. So, declare these functions as macros that expand to the + * corresponding built-ins. This approach is recommended in the gcc man page. + * We still need the actual function definitions in case gcc calls them. + */ +void *memset(void *s, int c, size_t n); +#define memset(s, c, n) __builtin_memset((s), (c), (n)) + +void *memcpy(void *dest, const void *src, size_t n); +#define memcpy(dest, src, n) __builtin_memcpy((dest), (src), (n)) + +void *memmove(void *dest, const void *src, size_t n); +#define memmove(dest, src, n) __builtin_memmove((dest), (src), (n)) + +int memcmp(const void *s1, const void *s2, size_t n); +#define memcmp(s1, s2, n) __builtin_memcmp((s1), (s2), (n)) + +#undef LIBDEFLATE_ENABLE_ASSERTIONS +#else +#include <string.h> +#endif + +/* + * Runtime assertion support. Don't enable this in production builds; it may + * hurt performance significantly. 
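+ *
+ * Note that when LIBDEFLATE_ENABLE_ASSERTIONS is not defined, ASSERT(expr)
+ * below expands to '(void)(expr)' rather than to nothing, so the expression
+ * is still nominally evaluated (a side-effect-free check is normally
+ * optimized away by the compiler).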
+ */ +#ifdef LIBDEFLATE_ENABLE_ASSERTIONS +void libdeflate_assertion_failed(const char *expr, const char *file, int line); +#define ASSERT(expr) { if (unlikely(!(expr))) \ + libdeflate_assertion_failed(#expr, __FILE__, __LINE__); } +#else +#define ASSERT(expr) (void)(expr) +#endif + +#define CONCAT_IMPL(a, b) a##b +#define CONCAT(a, b) CONCAT_IMPL(a, b) +#define ADD_SUFFIX(name) CONCAT(name, SUFFIX) + +#endif /* LIB_LIB_COMMON_H */ + +/* + * deflate_constants.h - constants for the DEFLATE compression format + */ + +#ifndef LIB_DEFLATE_CONSTANTS_H +#define LIB_DEFLATE_CONSTANTS_H + +/* Valid block types */ +#define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0 +#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1 +#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2 + +/* Minimum and maximum supported match lengths (in bytes) */ +#define DEFLATE_MIN_MATCH_LEN 3 +#define DEFLATE_MAX_MATCH_LEN 258 + +/* Maximum supported match offset (in bytes) */ +#define DEFLATE_MAX_MATCH_OFFSET 32768 + +/* log2 of DEFLATE_MAX_MATCH_OFFSET */ +#define DEFLATE_WINDOW_ORDER 15 + +/* Number of symbols in each Huffman code. Note: for the literal/length + * and offset codes, these are actually the maximum values; a given block + * might use fewer symbols. */ +#define DEFLATE_NUM_PRECODE_SYMS 19 +#define DEFLATE_NUM_LITLEN_SYMS 288 +#define DEFLATE_NUM_OFFSET_SYMS 32 + +/* The maximum number of symbols across all codes */ +#define DEFLATE_MAX_NUM_SYMS 288 + +/* Division of symbols in the literal/length code */ +#define DEFLATE_NUM_LITERALS 256 +#define DEFLATE_END_OF_BLOCK 256 +#define DEFLATE_FIRST_LEN_SYM 257 + +/* Maximum codeword length, in bits, within each Huffman code */ +#define DEFLATE_MAX_PRE_CODEWORD_LEN 7 +#define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15 +#define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15 + +/* The maximum codeword length across all codes */ +#define DEFLATE_MAX_CODEWORD_LEN 15 + +/* Maximum possible overrun when decoding codeword lengths */ +#define DEFLATE_MAX_LENS_OVERRUN 137 + +/* + * Maximum number of extra bits that may be required to represent a match + * length or offset. + */ +#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5 +#define DEFLATE_MAX_EXTRA_OFFSET_BITS 13 + +#endif /* LIB_DEFLATE_CONSTANTS_H */ + +/* + * cpu_features_common.h - code shared by all lib/$arch/cpu_features.c + * + * Copyright 2020 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef LIB_CPU_FEATURES_COMMON_H +#define LIB_CPU_FEATURES_COMMON_H + +#if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING) + /* for strdup() and strtok_r() */ +# undef _ANSI_SOURCE +# ifndef __APPLE__ +# undef _GNU_SOURCE +# define _GNU_SOURCE +# endif +# include <stdio.h> +# include <stdlib.h> +# include <string.h> +#endif + +struct cpu_feature { + u32 bit; + const char *name; +}; + +#if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING) +/* Disable any features that are listed in $LIBDEFLATE_DISABLE_CPU_FEATURES. */ +static inline void +disable_cpu_features_for_testing(u32 *features, + const struct cpu_feature *feature_table, + size_t feature_table_length) +{ + char *env_value, *strbuf, *p, *saveptr = NULL; + size_t i; + + env_value = getenv("LIBDEFLATE_DISABLE_CPU_FEATURES"); + if (!env_value) + return; + strbuf = strdup(env_value); + if (!strbuf) + abort(); + p = strtok_r(strbuf, ",", &saveptr); + while (p) { + for (i = 0; i < feature_table_length; i++) { + if (strcmp(p, feature_table[i].name) == 0) { + *features &= ~feature_table[i].bit; + break; + } + } + if (i == feature_table_length) { + fprintf(stderr, + "unrecognized feature in LIBDEFLATE_DISABLE_CPU_FEATURES: \"%s\"\n", + p); + abort(); + } + p = strtok_r(NULL, ",", &saveptr); + } + free(strbuf); +} +#else /* TEST_SUPPORT__DO_NOT_USE */ +static inline void +disable_cpu_features_for_testing(u32 *features, + const struct cpu_feature *feature_table, + size_t feature_table_length) +{ +} +#endif /* !TEST_SUPPORT__DO_NOT_USE */ + +#endif /* LIB_CPU_FEATURES_COMMON_H */ + +/* + * x86/cpu_features.h - feature detection for x86 CPUs + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef LIB_X86_CPU_FEATURES_H +#define LIB_X86_CPU_FEATURES_H + +#define HAVE_DYNAMIC_X86_CPU_FEATURES 0 + +#if defined(ARCH_X86_32) || defined(ARCH_X86_64) + +#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER) +# undef HAVE_DYNAMIC_X86_CPU_FEATURES +# define HAVE_DYNAMIC_X86_CPU_FEATURES 1 +#endif + +#define X86_CPU_FEATURE_SSE2 0x00000001 +#define X86_CPU_FEATURE_PCLMUL 0x00000002 +#define X86_CPU_FEATURE_AVX 0x00000004 +#define X86_CPU_FEATURE_AVX2 0x00000008 +#define X86_CPU_FEATURE_BMI2 0x00000010 + +#define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2)) +#define HAVE_PCLMUL(features) (HAVE_PCLMUL_NATIVE || ((features) & X86_CPU_FEATURE_PCLMUL)) +#define HAVE_AVX(features) (HAVE_AVX_NATIVE || ((features) & X86_CPU_FEATURE_AVX)) +#define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2)) +#define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2)) + +#if HAVE_DYNAMIC_X86_CPU_FEATURES +#define X86_CPU_FEATURES_KNOWN 0x80000000 +extern volatile u32 libdeflate_x86_cpu_features; + +void libdeflate_init_x86_cpu_features(void); + +static inline u32 get_x86_cpu_features(void) +{ + if (libdeflate_x86_cpu_features == 0) + libdeflate_init_x86_cpu_features(); + return libdeflate_x86_cpu_features; +} +#else /* HAVE_DYNAMIC_X86_CPU_FEATURES */ +static inline u32 get_x86_cpu_features(void) { return 0; } +#endif /* !HAVE_DYNAMIC_X86_CPU_FEATURES */ + +/* + * Prior to gcc 4.9 (r200349) and clang 3.8 (r239883), x86 intrinsics not + * available in the main target couldn't be used in 'target' attribute + * functions. Unfortunately clang has no feature test macro for this, so we + * have to check its version. + */ +#if HAVE_DYNAMIC_X86_CPU_FEATURES && \ + (GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000) || defined(_MSC_VER)) +# define HAVE_TARGET_INTRINSICS 1 +#else +# define HAVE_TARGET_INTRINSICS 0 +#endif + +/* SSE2 */ +#if defined(__SSE2__) || \ + (defined(_MSC_VER) && \ + (defined(ARCH_X86_64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))) +# define HAVE_SSE2_NATIVE 1 +#else +# define HAVE_SSE2_NATIVE 0 +#endif +#define HAVE_SSE2_INTRIN (HAVE_SSE2_NATIVE || HAVE_TARGET_INTRINSICS) + +/* PCLMUL */ +#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__)) +# define HAVE_PCLMUL_NATIVE 1 +#else +# define HAVE_PCLMUL_NATIVE 0 +#endif +#if HAVE_PCLMUL_NATIVE || (HAVE_TARGET_INTRINSICS && \ + (GCC_PREREQ(4, 4) || CLANG_PREREQ(3, 2, 0) || \ + defined(_MSC_VER))) +# define HAVE_PCLMUL_INTRIN 1 +#else +# define HAVE_PCLMUL_INTRIN 0 +#endif + +/* AVX */ +#ifdef __AVX__ +# define HAVE_AVX_NATIVE 1 +#else +# define HAVE_AVX_NATIVE 0 +#endif +#if HAVE_AVX_NATIVE || (HAVE_TARGET_INTRINSICS && \ + (GCC_PREREQ(4, 6) || CLANG_PREREQ(3, 0, 0) || \ + defined(_MSC_VER))) +# define HAVE_AVX_INTRIN 1 +#else +# define HAVE_AVX_INTRIN 0 +#endif + +/* AVX2 */ +#ifdef __AVX2__ +# define HAVE_AVX2_NATIVE 1 +#else +# define HAVE_AVX2_NATIVE 0 +#endif +#if HAVE_AVX2_NATIVE || (HAVE_TARGET_INTRINSICS && \ + (GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \ + defined(_MSC_VER))) +# define HAVE_AVX2_INTRIN 1 +#else +# define HAVE_AVX2_INTRIN 0 +#endif + +/* BMI2 */ +#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__)) +# define HAVE_BMI2_NATIVE 1 +#else +# define HAVE_BMI2_NATIVE 0 +#endif +#if HAVE_BMI2_NATIVE || (HAVE_TARGET_INTRINSICS && \ + (GCC_PREREQ(4, 7) || CLANG_PREREQ(3, 1, 0) || \ + defined(_MSC_VER))) +# define HAVE_BMI2_INTRIN 1 +#else +# define HAVE_BMI2_INTRIN 0 +#endif + +#endif /* 
ARCH_X86_32 || ARCH_X86_64 */ + +#endif /* LIB_X86_CPU_FEATURES_H */ + + +/* + * If the expression passed to SAFETY_CHECK() evaluates to false, then the + * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the + * compressed data is invalid. + * + * Theoretically, these checks could be disabled for specialized applications + * where all input to the decompressor will be trusted. + */ +#if 0 +# pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!") +# define SAFETY_CHECK(expr) (void)(expr) +#else +# define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA +#endif + +/***************************************************************************** + * Input bitstream * + *****************************************************************************/ + +/* + * The state of the "input bitstream" consists of the following variables: + * + * - in_next: a pointer to the next unread byte in the input buffer + * + * - in_end: a pointer to just past the end of the input buffer + * + * - bitbuf: a word-sized variable containing bits that have been read from + * the input buffer or from the implicit appended zero bytes + * + * - bitsleft: the number of bits in 'bitbuf' available to be consumed. + * After REFILL_BITS_BRANCHLESS(), 'bitbuf' can actually + * contain more bits than this. However, only the bits counted + * by 'bitsleft' can actually be consumed; the rest can only be + * used for preloading. + * + * As a micro-optimization, we allow bits 8 and higher of + * 'bitsleft' to contain garbage. When consuming the bits + * associated with a decode table entry, this allows us to do + * 'bitsleft -= entry' instead of 'bitsleft -= (u8)entry'. + * On some CPUs, this helps reduce instruction dependencies. + * This does have the disadvantage that 'bitsleft' sometimes + * needs to be cast to 'u8', such as when it's used as a shift + * amount in REFILL_BITS_BRANCHLESS(). But that one happens + * for free since most CPUs ignore high bits in shift amounts. + * + * - overread_count: the total number of implicit appended zero bytes that + * have been loaded into the bitbuffer, including any + * counted by 'bitsleft' and any already consumed + */ + +/* + * The type for the bitbuffer variable ('bitbuf' described above). For best + * performance, this should have size equal to a machine word. + * + * 64-bit platforms have a significant advantage: they get a bigger bitbuffer + * which they don't have to refill as often. + */ +typedef machine_word_t bitbuf_t; +#define BITBUF_NBITS (8 * (int)sizeof(bitbuf_t)) + +/* BITMASK(n) returns a bitmask of length 'n'. */ +#define BITMASK(n) (((bitbuf_t)1 << (n)) - 1) + +/* + * MAX_BITSLEFT is the maximum number of consumable bits, i.e. the maximum value + * of '(u8)bitsleft'. This is the size of the bitbuffer variable, minus 1 if + * the branchless refill method is being used (see REFILL_BITS_BRANCHLESS()). + */ +#define MAX_BITSLEFT \ + (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS - 1 : BITBUF_NBITS) + +/* + * CONSUMABLE_NBITS is the minimum number of bits that are guaranteed to be + * consumable (counted in 'bitsleft') immediately after refilling the bitbuffer. + * Since only whole bytes can be added to 'bitsleft', the worst case is + * 'MAX_BITSLEFT - 7': the smallest amount where another byte doesn't fit. 
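+ *
+ * For example, on a 64-bit build using the branchless refill, MAX_BITSLEFT
+ * is 63, so CONSUMABLE_NBITS is 56, i.e. 7 whole bytes are guaranteed to be
+ * consumable right after a refill.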
+ */ +#define CONSUMABLE_NBITS (MAX_BITSLEFT - 7) + +/* + * FASTLOOP_PRELOADABLE_NBITS is the minimum number of bits that are guaranteed + * to be preloadable immediately after REFILL_BITS_IN_FASTLOOP(). (It is *not* + * guaranteed after REFILL_BITS(), since REFILL_BITS() falls back to a + * byte-at-a-time refill method near the end of input.) This may exceed the + * number of consumable bits (counted by 'bitsleft'). Any bits not counted in + * 'bitsleft' can only be used for precomputation and cannot be consumed. + */ +#define FASTLOOP_PRELOADABLE_NBITS \ + (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS : CONSUMABLE_NBITS) + +/* + * PRELOAD_SLACK is the minimum number of bits that are guaranteed to be + * preloadable but not consumable, following REFILL_BITS_IN_FASTLOOP() and any + * subsequent consumptions. This is 1 bit if the branchless refill method is + * being used, and 0 bits otherwise. + */ +#define PRELOAD_SLACK MAX(0, FASTLOOP_PRELOADABLE_NBITS - MAX_BITSLEFT) + +/* + * CAN_CONSUME(n) is true if it's guaranteed that if the bitbuffer has just been + * refilled, then it's always possible to consume 'n' bits from it. 'n' should + * be a compile-time constant, to enable compile-time evaluation. + */ +#define CAN_CONSUME(n) (CONSUMABLE_NBITS >= (n)) + +/* + * CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) is true if it's + * guaranteed that after REFILL_BITS_IN_FASTLOOP(), it's always possible to + * consume 'consume_nbits' bits, then preload 'preload_nbits' bits. The + * arguments should be compile-time constants to enable compile-time evaluation. + */ +#define CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) \ + (CONSUMABLE_NBITS >= (consume_nbits) && \ + FASTLOOP_PRELOADABLE_NBITS >= (consume_nbits) + (preload_nbits)) + +/* + * REFILL_BITS_BRANCHLESS() branchlessly refills the bitbuffer variable by + * reading the next word from the input buffer and updating 'in_next' and + * 'bitsleft' based on how many bits were refilled -- counting whole bytes only. + * This is much faster than reading a byte at a time, at least if the CPU is + * little endian and supports fast unaligned memory accesses. + * + * The simplest way of branchlessly updating 'bitsleft' would be: + * + * bitsleft += (MAX_BITSLEFT - bitsleft) & ~7; + * + * To make it faster, we define MAX_BITSLEFT to be 'WORDBITS - 1' rather than + * WORDBITS, so that in binary it looks like 111111 or 11111. Then, we update + * 'bitsleft' by just setting the bits above the low 3 bits: + * + * bitsleft |= MAX_BITSLEFT & ~7; + * + * That compiles down to a single instruction like 'or $0x38, %rbp'. Using + * 'MAX_BITSLEFT == WORDBITS - 1' also has the advantage that refills can be + * done when 'bitsleft == MAX_BITSLEFT' without invoking undefined behavior. + * + * The simplest way of branchlessly updating 'in_next' would be: + * + * in_next += (MAX_BITSLEFT - bitsleft) >> 3; + * + * With 'MAX_BITSLEFT == WORDBITS - 1' we could use an XOR instead, though this + * isn't really better: + * + * in_next += (MAX_BITSLEFT ^ bitsleft) >> 3; + * + * An alternative which can be marginally better is the following: + * + * in_next += sizeof(bitbuf_t) - 1; + * in_next -= (bitsleft >> 3) & 0x7; + * + * It seems this would increase the number of CPU instructions from 3 (sub, shr, + * add) to 4 (add, shr, and, sub). However, if the CPU has a bitfield + * extraction instruction (e.g. arm's ubfx), it stays at 3, and is potentially + * more efficient because the length of the longest dependency chain decreases + * from 3 to 2. 
This alternative also has the advantage that it ignores the + * high bits in 'bitsleft', so it is compatible with the micro-optimization we + * use where we let the high bits of 'bitsleft' contain garbage. + */ +#define REFILL_BITS_BRANCHLESS() \ +do { \ + bitbuf |= get_unaligned_leword(in_next) << (u8)bitsleft; \ + in_next += sizeof(bitbuf_t) - 1; \ + in_next -= (bitsleft >> 3) & 0x7; \ + bitsleft |= MAX_BITSLEFT & ~7; \ +} while (0) + +/* + * REFILL_BITS() loads bits from the input buffer until the bitbuffer variable + * contains at least CONSUMABLE_NBITS consumable bits. + * + * This checks for the end of input, and it doesn't guarantee + * FASTLOOP_PRELOADABLE_NBITS, so it can't be used in the fastloop. + * + * If we would overread the input buffer, we just don't read anything, leaving + * the bits zeroed but marking them filled. This simplifies the decompressor + * because it removes the need to always be able to distinguish between real + * overreads and overreads caused only by the decompressor's own lookahead. + * + * We do still keep track of the number of bytes that have been overread, for + * two reasons. First, it allows us to determine the exact number of bytes that + * were consumed once the stream ends or an uncompressed block is reached. + * Second, it allows us to stop early if the overread amount gets so large (more + * than sizeof bitbuf) that it can only be caused by a real overread. (The + * second part is arguably unneeded, since libdeflate is buffer-based; given + * infinite zeroes, it will eventually either completely fill the output buffer + * or return an error. However, we do it to be slightly more friendly to the + * not-recommended use case of decompressing with an unknown output size.) + */ +#define REFILL_BITS() \ +do { \ + if (UNALIGNED_ACCESS_IS_FAST && \ + likely(in_end - in_next >= sizeof(bitbuf_t))) { \ + REFILL_BITS_BRANCHLESS(); \ + } else { \ + while ((u8)bitsleft < CONSUMABLE_NBITS) { \ + if (likely(in_next != in_end)) { \ + bitbuf |= (bitbuf_t)*in_next++ << \ + (u8)bitsleft; \ + } else { \ + overread_count++; \ + SAFETY_CHECK(overread_count <= \ + sizeof(bitbuf_t)); \ + } \ + bitsleft += 8; \ + } \ + } \ +} while (0) + +/* + * REFILL_BITS_IN_FASTLOOP() is like REFILL_BITS(), but it doesn't check for the + * end of the input. It can only be used in the fastloop. + */ +#define REFILL_BITS_IN_FASTLOOP() \ +do { \ + STATIC_ASSERT(UNALIGNED_ACCESS_IS_FAST || \ + FASTLOOP_PRELOADABLE_NBITS == CONSUMABLE_NBITS); \ + if (UNALIGNED_ACCESS_IS_FAST) { \ + REFILL_BITS_BRANCHLESS(); \ + } else { \ + while ((u8)bitsleft < CONSUMABLE_NBITS) { \ + bitbuf |= (bitbuf_t)*in_next++ << (u8)bitsleft; \ + bitsleft += 8; \ + } \ + } \ +} while (0) + +/* + * This is the worst-case maximum number of output bytes that are written to + * during each iteration of the fastloop. The worst case is 2 literals, then a + * match of length DEFLATE_MAX_MATCH_LEN. Additionally, some slack space must + * be included for the intentional overrun in the match copy implementation. + */ +#define FASTLOOP_MAX_BYTES_WRITTEN \ + (2 + DEFLATE_MAX_MATCH_LEN + (5 * WORDBYTES) - 1) + +/* + * This is the worst-case maximum number of input bytes that are read during + * each iteration of the fastloop. To get this value, we first compute the + * greatest number of bits that can be refilled during a loop iteration. 
The + * refill at the beginning can add at most MAX_BITSLEFT, and the amount that can + * be refilled later is no more than the maximum amount that can be consumed by + * 2 literals that don't need a subtable, then a match. We convert this value + * to bytes, rounding up; this gives the maximum number of bytes that 'in_next' + * can be advanced. Finally, we add sizeof(bitbuf_t) to account for + * REFILL_BITS_BRANCHLESS() reading a word past 'in_next'. + */ +#define FASTLOOP_MAX_BYTES_READ \ + (DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) + \ + LENGTH_MAXBITS + OFFSET_MAXBITS, 8) + \ + sizeof(bitbuf_t)) + +/***************************************************************************** + * Huffman decoding * + *****************************************************************************/ + +/* + * The fastest way to decode Huffman-encoded data is basically to use a decode + * table that maps the next TABLEBITS bits of data to their symbol. Each entry + * decode_table[i] maps to the symbol whose codeword is a prefix of 'i'. A + * symbol with codeword length 'n' has '2**(TABLEBITS-n)' entries in the table. + * + * Ideally, TABLEBITS and the maximum codeword length would be the same; some + * compression formats are designed with this goal in mind. Unfortunately, in + * DEFLATE, the maximum litlen and offset codeword lengths are 15 bits, which is + * too large for a practical TABLEBITS. It's not *that* much larger, though, so + * the workaround is to use a single level of subtables. In the main table, + * entries for prefixes of codewords longer than TABLEBITS contain a "pointer" + * to the appropriate subtable along with the number of bits it is indexed with. + * + * The most efficient way to allocate subtables is to allocate them dynamically + * after the main table. The worst-case number of table entries needed, + * including subtables, is precomputable; see the ENOUGH constants below. + * + * A useful optimization is to store the codeword lengths in the decode table so + * that they don't have to be looked up by indexing a separate table that maps + * symbols to their codeword lengths. We basically do this; however, for the + * litlen and offset codes we also implement some DEFLATE-specific optimizations + * that build in the consideration of the "extra bits" and the + * literal/length/end-of-block division. For the exact decode table entry + * format we use, see the definitions of the *_decode_results[] arrays below. + */ + + +/* + * These are the TABLEBITS values we use for each of the DEFLATE Huffman codes, + * along with their corresponding ENOUGH values. + * + * For the precode, we use PRECODE_TABLEBITS == 7 since this is the maximum + * precode codeword length. This avoids ever needing subtables. + * + * For the litlen and offset codes, we cannot realistically avoid ever needing + * subtables, since litlen and offset codewords can be up to 15 bits. A higher + * TABLEBITS reduces the number of lookups that need a subtable, which increases + * performance; however, it increases memory usage and makes building the table + * take longer, which decreases performance. We choose values that work well in + * practice, making subtables rarely needed without making the tables too large. + * + * Our choice of OFFSET_TABLEBITS == 8 is a bit low; without any special + * considerations, 9 would fit the trade-off curve better. 
However, there is a + * performance benefit to using exactly 8 bits when it is a compile-time + * constant, as many CPUs can take the low byte more easily than the low 9 bits. + * + * zlib treats its equivalents of TABLEBITS as maximum values; whenever it + * builds a table, it caps the actual table_bits to the longest codeword. This + * makes sense in theory, as there's no need for the table to be any larger than + * needed to support the longest codeword. However, having the table bits be a + * compile-time constant is beneficial to the performance of the decode loop, so + * there is a trade-off. libdeflate currently uses the dynamic table_bits + * strategy for the litlen table only, due to its larger maximum size. + * PRECODE_TABLEBITS and OFFSET_TABLEBITS are smaller, so going dynamic there + * isn't as useful, and OFFSET_TABLEBITS=8 is useful as mentioned above. + * + * Each TABLEBITS value has a corresponding ENOUGH value that gives the + * worst-case maximum number of decode table entries, including the main table + * and all subtables. The ENOUGH value depends on three parameters: + * + * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS) + * (2) the maximum number of main table bits (*_TABLEBITS) + * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN) + * + * The ENOUGH values were computed using the utility program 'enough' from zlib. + */ +#define PRECODE_TABLEBITS 7 +#define PRECODE_ENOUGH 128 /* enough 19 7 7 */ +#define LITLEN_TABLEBITS 11 +#define LITLEN_ENOUGH 2342 /* enough 288 11 15 */ +#define OFFSET_TABLEBITS 8 +#define OFFSET_ENOUGH 402 /* enough 32 8 15 */ + +/* + * make_decode_table_entry() creates a decode table entry for the given symbol + * by combining the static part 'decode_results[sym]' with the dynamic part + * 'len', which is the remaining codeword length (the codeword length for main + * table entries, or the codeword length minus TABLEBITS for subtable entries). + * + * In all cases, we add 'len' to each of the two low-order bytes to create the + * appropriately-formatted decode table entry. See the definitions of the + * *_decode_results[] arrays below, where the entry format is described. + */ +static forceinline u32 +make_decode_table_entry(const u32 decode_results[], u32 sym, u32 len) +{ + return decode_results[sym] + (len << 8) + len; +} + +/* + * Here is the format of our precode decode table entries. Bits not explicitly + * described contain zeroes: + * + * Bit 20-16: presym + * Bit 10-8: codeword length [not used] + * Bit 2-0: codeword length + * + * The precode decode table never has subtables, since we use + * PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN. + * + * precode_decode_results[] contains the static part of the entry for each + * symbol. make_decode_table_entry() produces the final entries. 
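+ *
+ * For example, presym 16 with a 7-bit codeword yields the final entry
+ * (16 << 16) + (7 << 8) + 7 == 0x00100707, i.e. the presym in bits 20-16 and
+ * the codeword length in bits 10-8 and 2-0.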
+ */ +static const u32 precode_decode_results[] = { +#define ENTRY(presym) ((u32)presym << 16) + ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , + ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , + ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , + ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , + ENTRY(16) , ENTRY(17) , ENTRY(18) , +#undef ENTRY +}; + +/* Litlen and offset decode table entry flags */ + +/* Indicates a literal entry in the litlen decode table */ +#define HUFFDEC_LITERAL 0x80000000 + +/* Indicates that HUFFDEC_SUBTABLE_POINTER or HUFFDEC_END_OF_BLOCK is set */ +#define HUFFDEC_EXCEPTIONAL 0x00008000 + +/* Indicates a subtable pointer entry in the litlen or offset decode table */ +#define HUFFDEC_SUBTABLE_POINTER 0x00004000 + +/* Indicates an end-of-block entry in the litlen decode table */ +#define HUFFDEC_END_OF_BLOCK 0x00002000 + +/* Maximum number of bits that can be consumed by decoding a match length */ +#define LENGTH_MAXBITS (DEFLATE_MAX_LITLEN_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS) +#define LENGTH_MAXFASTBITS (LITLEN_TABLEBITS /* no subtable needed */ + \ + DEFLATE_MAX_EXTRA_LENGTH_BITS) + +/* + * Here is the format of our litlen decode table entries. Bits not explicitly + * described contain zeroes: + * + * Literals: + * Bit 31: 1 (HUFFDEC_LITERAL) + * Bit 23-16: literal value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length [not used] + * Bit 3-0: remaining codeword length + * Lengths: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 24-16: length base value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length + * Bit 4-0: remaining codeword length + number of extra bits + * End of block: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 1 (HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length [not used] + * Bit 3-0: remaining codeword length + * Subtable pointer: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 30-16: index of start of subtable + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: number of subtable bits + * Bit 3-0: number of main table bits + * + * This format has several desirable properties: + * + * - The codeword length, length slot base, and number of extra length bits + * are all built in. This eliminates the need to separately look up this + * information by indexing separate arrays by symbol or length slot. + * + * - The HUFFDEC_* flags enable easily distinguishing between the different + * types of entries. The HUFFDEC_LITERAL flag enables a fast path for + * literals; the high bit is used for this, as some CPUs can test the + * high bit more easily than other bits. The HUFFDEC_EXCEPTIONAL flag + * makes it possible to detect the two unlikely cases (subtable pointer + * and end of block) in a single bit flag test. + * + * - The low byte is the number of bits that need to be removed from the + * bitstream; this makes this value easily accessible, and it enables the + * micro-optimization of doing 'bitsleft -= entry' instead of + * 'bitsleft -= (u8)entry'. It also includes the number of extra bits, + * so they don't need to be removed separately. 
+ * + * - The flags in bits 15-13 are arranged to be 0 when the + * "remaining codeword length" in bits 11-8 is needed, making this value + * fairly easily accessible as well via a shift and downcast. + * + * - Similarly, bits 13-12 are 0 when the "subtable bits" in bits 11-8 are + * needed, making it possible to extract this value with '& 0x3F' rather + * than '& 0xF'. This value is only used as a shift amount, so this can + * save an 'and' instruction as the masking by 0x3F happens implicitly. + * + * litlen_decode_results[] contains the static part of the entry for each + * symbol. make_decode_table_entry() produces the final entries. + */ +static const u32 litlen_decode_results[] = { + + /* Literals */ +#define ENTRY(literal) (HUFFDEC_LITERAL | ((u32)literal << 16)) + ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , + ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , + ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , + ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , + ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) , + ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) , + ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) , + ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) , + ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) , + ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) , + ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) , + ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) , + ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) , + ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) , + ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) , + ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) , + ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) , + ENTRY(68) , ENTRY(69) , ENTRY(70) , ENTRY(71) , + ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) , + ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) , + ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) , + ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) , + ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) , + ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) , + ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) , + ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) , + ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) , + ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) , + ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) , + ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) , + ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) , + ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) , + ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) , + ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) , + ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) , + ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) , + ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) , + ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) , + ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) , + ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) , + ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) , + ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) , + ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) , + ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) , + ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) , + ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) , + ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) , + ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) , + ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) , + ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) , + ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) , + ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) , + ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) , + ENTRY(212) , 
ENTRY(213) , ENTRY(214) , ENTRY(215) , + ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) , + ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) , + ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) , + ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) , + ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) , + ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) , + ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) , + ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) , + ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) , + ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) , +#undef ENTRY + + /* End of block */ + HUFFDEC_EXCEPTIONAL | HUFFDEC_END_OF_BLOCK, + + /* Lengths */ +#define ENTRY(length_base, num_extra_bits) \ + (((u32)(length_base) << 16) | (num_extra_bits)) + ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0), + ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0), + ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1), + ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2), + ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3), + ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4), + ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5), + ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) , +#undef ENTRY +}; + +/* Maximum number of bits that can be consumed by decoding a match offset */ +#define OFFSET_MAXBITS (DEFLATE_MAX_OFFSET_CODEWORD_LEN + \ + DEFLATE_MAX_EXTRA_OFFSET_BITS) +#define OFFSET_MAXFASTBITS (OFFSET_TABLEBITS /* no subtable needed */ + \ + DEFLATE_MAX_EXTRA_OFFSET_BITS) + +/* + * Here is the format of our offset decode table entries. Bits not explicitly + * described contain zeroes: + * + * Offsets: + * Bit 31-16: offset base value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 11-8: remaining codeword length + * Bit 4-0: remaining codeword length + number of extra bits + * Subtable pointer: + * Bit 31-16: index of start of subtable + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) + * Bit 11-8: number of subtable bits + * Bit 3-0: number of main table bits + * + * These work the same way as the length entries and subtable pointer entries in + * the litlen decode table; see litlen_decode_results[] above. + */ +static const u32 offset_decode_results[] = { +#define ENTRY(offset_base, num_extra_bits) \ + (((u32)(offset_base) << 16) | (num_extra_bits)) + ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) , + ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) , + ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) , + ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) , + ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) , + ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) , + ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) , + ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , +#undef ENTRY +}; + +/* + * The main DEFLATE decompressor structure. Since libdeflate only supports + * full-buffer decompression, this structure doesn't store the entire + * decompression state, most of which is in stack variables. Instead, this + * struct just contains the decode tables and some temporary arrays used for + * building them, as these are too large to comfortably allocate on the stack. 
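+ * For example, the litlen decode table alone is LITLEN_ENOUGH == 2342 u32
+ * entries, i.e. over 9 KiB.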
+ * + * Storing the decode tables in the decompressor struct also allows the decode + * tables for the static codes to be reused whenever two static Huffman blocks + * are decoded without an intervening dynamic block, even across streams. + */ +struct libdeflate_decompressor { + + /* + * The arrays aren't all needed at the same time. 'precode_lens' and + * 'precode_decode_table' are unneeded after 'lens' has been filled. + * Furthermore, 'lens' need not be retained after building the litlen + * and offset decode tables. In fact, 'lens' can be in union with + * 'litlen_decode_table' provided that 'offset_decode_table' is separate + * and is built first. + */ + + union { + u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; + + struct { + u8 lens[DEFLATE_NUM_LITLEN_SYMS + + DEFLATE_NUM_OFFSET_SYMS + + DEFLATE_MAX_LENS_OVERRUN]; + + u32 precode_decode_table[PRECODE_ENOUGH]; + } l; + + u32 litlen_decode_table[LITLEN_ENOUGH]; + } u; + + u32 offset_decode_table[OFFSET_ENOUGH]; + + /* used only during build_decode_table() */ + u16 sorted_syms[DEFLATE_MAX_NUM_SYMS]; + + bool static_codes_loaded; + unsigned litlen_tablebits; + + /* The free() function for this struct, chosen at allocation time */ + free_func_t free_func; +}; + +/* + * Build a table for fast decoding of symbols from a Huffman code. As input, + * this function takes the codeword length of each symbol which may be used in + * the code. As output, it produces a decode table for the canonical Huffman + * code described by the codeword lengths. The decode table is built with the + * assumption that it will be indexed with "bit-reversed" codewords, where the + * low-order bit is the first bit of the codeword. This format is used for all + * Huffman codes in DEFLATE. + * + * @decode_table + * The array in which the decode table will be generated. This array must + * have sufficient length; see the definition of the ENOUGH numbers. + * @lens + * An array which provides, for each symbol, the length of the + * corresponding codeword in bits, or 0 if the symbol is unused. This may + * alias @decode_table, since nothing is written to @decode_table until all + * @lens have been consumed. All codeword lengths are assumed to be <= + * @max_codeword_len but are otherwise considered untrusted. If they do + * not form a valid Huffman code, then the decode table is not built and + * %false is returned. + * @num_syms + * The number of symbols in the code, including all unused symbols. + * @decode_results + * An array which gives the incomplete decode result for each symbol. The + * needed values in this array will be combined with codeword lengths to + * make the final decode table entries using make_decode_table_entry(). + * @table_bits + * The log base-2 of the number of main table entries to use. + * If @table_bits_ret != NULL, then @table_bits is treated as a maximum + * value and it will be decreased if a smaller table would be sufficient. + * @max_codeword_len + * The maximum allowed codeword length for this Huffman code. + * Must be <= DEFLATE_MAX_CODEWORD_LEN. + * @sorted_syms + * A temporary array of length @num_syms. + * @table_bits_ret + * If non-NULL, then the dynamic table_bits is enabled, and the actual + * table_bits value will be returned here. + * + * Returns %true if successful; %false if the codeword lengths do not form a + * valid Huffman code. 
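+ *
+ * For example, build_precode_decode_table() below calls this with
+ * table_bits == PRECODE_TABLEBITS (7) and table_bits_ret == NULL, so the
+ * main table is always 1 << 7 == 128 entries and no subtables are needed.
+ * The litlen table is instead built with table_bits_ret != NULL, letting
+ * its table_bits shrink to the longest codeword actually used.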
+ */ +static bool +build_decode_table(u32 decode_table[], + const u8 lens[], + const unsigned num_syms, + const u32 decode_results[], + unsigned table_bits, + unsigned max_codeword_len, + u16 *sorted_syms, + unsigned *table_bits_ret) +{ + unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned sym; /* current symbol */ + unsigned codeword; /* current codeword, bit-reversed */ + unsigned len; /* current codeword length in bits */ + unsigned count; /* num codewords remaining with this length */ + u32 codespace_used; /* codespace used out of '2^max_codeword_len' */ + unsigned cur_table_end; /* end index of current table */ + unsigned subtable_prefix; /* codeword prefix of current subtable */ + unsigned subtable_start; /* start index of current subtable */ + unsigned subtable_bits; /* log2 of current subtable length */ + + /* Count how many codewords have each length, including 0. */ + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + for (sym = 0; sym < num_syms; sym++) + len_counts[lens[sym]]++; + + /* + * Determine the actual maximum codeword length that was used, and + * decrease table_bits to it if allowed. + */ + while (max_codeword_len > 1 && len_counts[max_codeword_len] == 0) + max_codeword_len--; + if (table_bits_ret != NULL) { + table_bits = MIN(table_bits, max_codeword_len); + *table_bits_ret = table_bits; + } + + /* + * Sort the symbols primarily by increasing codeword length and + * secondarily by increasing symbol value; or equivalently by their + * codewords in lexicographic order, since a canonical code is assumed. + * + * For efficiency, also compute 'codespace_used' in the same pass over + * 'len_counts[]' used to build 'offsets[]' for sorting. + */ + + /* Ensure that 'codespace_used' cannot overflow. */ + STATIC_ASSERT(sizeof(codespace_used) == 4); + STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >= + DEFLATE_MAX_NUM_SYMS); + + offsets[0] = 0; + offsets[1] = len_counts[0]; + codespace_used = 0; + for (len = 1; len < max_codeword_len; len++) { + offsets[len + 1] = offsets[len] + len_counts[len]; + codespace_used = (codespace_used << 1) + len_counts[len]; + } + codespace_used = (codespace_used << 1) + len_counts[len]; + + for (sym = 0; sym < num_syms; sym++) + sorted_syms[offsets[lens[sym]]++] = sym; + + sorted_syms += offsets[0]; /* Skip unused symbols */ + + /* lens[] is done being used, so we can write to decode_table[] now. */ + + /* + * Check whether the lengths form a complete code (exactly fills the + * codespace), an incomplete code (doesn't fill the codespace), or an + * overfull code (overflows the codespace). A codeword of length 'n' + * uses proportion '1/(2^n)' of the codespace. An overfull code is + * nonsensical, so is considered invalid. An incomplete code is + * considered valid only in two specific cases; see below. + */ + + /* overfull code? */ + if (unlikely(codespace_used > (1U << max_codeword_len))) + return false; + + /* incomplete code? */ + if (unlikely(codespace_used < (1U << max_codeword_len))) { + u32 entry; + unsigned i; + + if (codespace_used == 0) { + /* + * An empty code is allowed. This can happen for the + * offset code in DEFLATE, since a dynamic Huffman block + * need not contain any matches. + */ + + /* sym=0, len=1 (arbitrary) */ + entry = make_decode_table_entry(decode_results, 0, 1); + } else { + /* + * Allow codes with a single used symbol, with codeword + * length 1. The DEFLATE RFC is unclear regarding this + * case. 
What zlib's decompressor does is permit this + * for the litlen and offset codes and assume the + * codeword is '0' rather than '1'. We do the same + * except we allow this for precodes too, since there's + * no convincing reason to treat the codes differently. + * We also assign both codewords '0' and '1' to the + * symbol to avoid having to handle '1' specially. + */ + if (codespace_used != (1U << (max_codeword_len - 1)) || + len_counts[1] != 1) + return false; + entry = make_decode_table_entry(decode_results, + *sorted_syms, 1); + } + /* + * Note: the decode table still must be fully initialized, in + * case the stream is malformed and contains bits from the part + * of the codespace the incomplete code doesn't use. + */ + for (i = 0; i < (1U << table_bits); i++) + decode_table[i] = entry; + return true; + } + + /* + * The lengths form a complete code. Now, enumerate the codewords in + * lexicographic order and fill the decode table entries for each one. + * + * First, process all codewords with len <= table_bits. Each one gets + * '2^(table_bits-len)' direct entries in the table. + * + * Since DEFLATE uses bit-reversed codewords, these entries aren't + * consecutive but rather are spaced '2^len' entries apart. This makes + * filling them naively somewhat awkward and inefficient, since strided + * stores are less cache-friendly and preclude the use of word or + * vector-at-a-time stores to fill multiple entries per instruction. + * + * To optimize this, we incrementally double the table size. When + * processing codewords with length 'len', the table is treated as + * having only '2^len' entries, so each codeword uses just one entry. + * Then, each time 'len' is incremented, the table size is doubled and + * the first half is copied to the second half. This significantly + * improves performance over naively doing strided stores. + * + * Note that some entries copied for each table doubling may not have + * been initialized yet, but it doesn't matter since they're guaranteed + * to be initialized later (because the Huffman code is complete). + */ + codeword = 0; + len = 1; + while ((count = len_counts[len]) == 0) + len++; + cur_table_end = 1U << len; + while (len <= table_bits) { + /* Process all 'count' codewords with length 'len' bits. */ + do { + unsigned bit; + + /* Fill the first entry for the current codeword. */ + decode_table[codeword] = + make_decode_table_entry(decode_results, + *sorted_syms++, len); + + if (codeword == cur_table_end - 1) { + /* Last codeword (all 1's) */ + for (; len < table_bits; len++) { + memcpy(&decode_table[cur_table_end], + decode_table, + cur_table_end * + sizeof(decode_table[0])); + cur_table_end <<= 1; + } + return true; + } + /* + * To advance to the lexicographically next codeword in + * the canonical code, the codeword must be incremented, + * then 0's must be appended to the codeword as needed + * to match the next codeword's length. + * + * Since the codeword is bit-reversed, appending 0's is + * a no-op. However, incrementing it is nontrivial. To + * do so efficiently, use the 'bsr' instruction to find + * the last (highest order) 0 bit in the codeword, set + * it, and clear any later (higher order) 1 bits. But + * 'bsr' actually finds the highest order 1 bit, so to + * use it first flip all bits in the codeword by XOR'ing + * it with (1U << len) - 1 == cur_table_end - 1. + */ + bit = 1U << bsr32(codeword ^ (cur_table_end - 1)); + codeword &= bit - 1; + codeword |= bit; + } while (--count); + + /* Advance to the next codeword length. 
*/ + do { + if (++len <= table_bits) { + memcpy(&decode_table[cur_table_end], + decode_table, + cur_table_end * sizeof(decode_table[0])); + cur_table_end <<= 1; + } + } while ((count = len_counts[len]) == 0); + } + + /* Process codewords with len > table_bits. These require subtables. */ + cur_table_end = 1U << table_bits; + subtable_prefix = -1; + subtable_start = 0; + for (;;) { + u32 entry; + unsigned i; + unsigned stride; + unsigned bit; + + /* + * Start a new subtable if the first 'table_bits' bits of the + * codeword don't match the prefix of the current subtable. + */ + if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) { + subtable_prefix = (codeword & ((1U << table_bits) - 1)); + subtable_start = cur_table_end; + /* + * Calculate the subtable length. If the codeword has + * length 'table_bits + n', then the subtable needs + * '2^n' entries. But it may need more; if fewer than + * '2^n' codewords of length 'table_bits + n' remain, + * then the length will need to be incremented to bring + * in longer codewords until the subtable can be + * completely filled. Note that because the Huffman + * code is complete, it will always be possible to fill + * the subtable eventually. + */ + subtable_bits = len - table_bits; + codespace_used = count; + while (codespace_used < (1U << subtable_bits)) { + subtable_bits++; + codespace_used = (codespace_used << 1) + + len_counts[table_bits + subtable_bits]; + } + cur_table_end = subtable_start + (1U << subtable_bits); + + /* + * Create the entry that points from the main table to + * the subtable. + */ + decode_table[subtable_prefix] = + ((u32)subtable_start << 16) | + HUFFDEC_EXCEPTIONAL | + HUFFDEC_SUBTABLE_POINTER | + (subtable_bits << 8) | table_bits; + } + + /* Fill the subtable entries for the current codeword. */ + entry = make_decode_table_entry(decode_results, *sorted_syms++, + len - table_bits); + i = subtable_start + (codeword >> table_bits); + stride = 1U << (len - table_bits); + do { + decode_table[i] = entry; + i += stride; + } while (i < cur_table_end); + + /* Advance to the next codeword. */ + if (codeword == (1U << len) - 1) /* last codeword (all 1's)? */ + return true; + bit = 1U << bsr32(codeword ^ ((1U << len) - 1)); + codeword &= bit - 1; + codeword |= bit; + count--; + while (count == 0) + count = len_counts[++len]; + } +} + +/* Build the decode table for the precode. */ +static bool +build_precode_decode_table(struct libdeflate_decompressor *d) +{ + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128); + + STATIC_ASSERT(ARRAY_LEN(precode_decode_results) == + DEFLATE_NUM_PRECODE_SYMS); + + return build_decode_table(d->u.l.precode_decode_table, + d->u.precode_lens, + DEFLATE_NUM_PRECODE_SYMS, + precode_decode_results, + PRECODE_TABLEBITS, + DEFLATE_MAX_PRE_CODEWORD_LEN, + d->sorted_syms, + NULL); +} + +/* Build the decode table for the literal/length code. */ +static bool +build_litlen_decode_table(struct libdeflate_decompressor *d, + unsigned num_litlen_syms, unsigned num_offset_syms) +{ + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! 
*/ + STATIC_ASSERT(LITLEN_TABLEBITS == 11 && LITLEN_ENOUGH == 2342); + + STATIC_ASSERT(ARRAY_LEN(litlen_decode_results) == + DEFLATE_NUM_LITLEN_SYMS); + + return build_decode_table(d->u.litlen_decode_table, + d->u.l.lens, + num_litlen_syms, + litlen_decode_results, + LITLEN_TABLEBITS, + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + d->sorted_syms, + &d->litlen_tablebits); +} + +/* Build the decode table for the offset code. */ +static bool +build_offset_decode_table(struct libdeflate_decompressor *d, + unsigned num_litlen_syms, unsigned num_offset_syms) +{ + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402); + + STATIC_ASSERT(ARRAY_LEN(offset_decode_results) == + DEFLATE_NUM_OFFSET_SYMS); + + return build_decode_table(d->offset_decode_table, + d->u.l.lens + num_litlen_syms, + num_offset_syms, + offset_decode_results, + OFFSET_TABLEBITS, + DEFLATE_MAX_OFFSET_CODEWORD_LEN, + d->sorted_syms, + NULL); +} + +/***************************************************************************** + * Main decompression routine + *****************************************************************************/ + +typedef enum libdeflate_result (*decompress_func_t) + (struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); + +#define FUNCNAME deflate_decompress_default +#undef ATTRIBUTES +#undef EXTRACT_VARBITS +#undef EXTRACT_VARBITS8 +/* + * decompress_template.h + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * This is the actual DEFLATE decompression routine, lifted out of + * deflate_decompress.c so that it can be compiled multiple times with different + * target instruction sets. 
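+ *
+ * Each expansion of this template is parameterized by the FUNCNAME,
+ * ATTRIBUTES, EXTRACT_VARBITS and EXTRACT_VARBITS8 macros defined just before
+ * it.  For example, the default build above uses FUNCNAME
+ * deflate_decompress_default, while the BMI2 build later in this file uses
+ * FUNCNAME deflate_decompress_bmi2 together with _target_attribute("bmi2")
+ * and bzhi-based bit extraction.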
+ */ + +#ifndef ATTRIBUTES +# define ATTRIBUTES +#endif +#ifndef EXTRACT_VARBITS +# define EXTRACT_VARBITS(word, count) ((word) & BITMASK(count)) +#endif +#ifndef EXTRACT_VARBITS8 +# define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count))) +#endif + +static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED +FUNCNAME(struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) +{ + u8 *out_next = out; + u8 * const out_end = out_next + out_nbytes_avail; + u8 * const out_fastloop_end = + out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN); + + /* Input bitstream state; see deflate_decompress.c for documentation */ + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + const u8 * const in_fastloop_end = + in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ); + bitbuf_t bitbuf = 0; + bitbuf_t saved_bitbuf; + u32 bitsleft = 0; + size_t overread_count = 0; + + bool is_final_block; + unsigned block_type; + unsigned num_litlen_syms; + unsigned num_offset_syms; + bitbuf_t litlen_tablemask; + u32 entry; + +next_block: + /* Starting to read the next block */ + ; + + STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3)); + REFILL_BITS(); + + /* BFINAL: 1 bit */ + is_final_block = bitbuf & BITMASK(1); + + /* BTYPE: 2 bits */ + block_type = (bitbuf >> 1) & BITMASK(2); + + if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { + + /* Dynamic Huffman block */ + + /* The order in which precode lengths are stored */ + static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + + unsigned num_explicit_precode_lens; + unsigned i; + + /* Read the codeword length counts. */ + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5)); + num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5)); + + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5)); + num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5)); + + STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4)); + num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4)); + + d->static_codes_loaded = false; + + /* + * Read the precode codeword lengths. + * + * A 64-bit bitbuffer is just one bit too small to hold the + * maximum number of precode lens, so to minimize branches we + * merge one len with the previous fields. + */ + STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); + if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { + d->u.precode_lens[deflate_precode_lens_permutation[0]] = + (bitbuf >> 17) & BITMASK(3); + bitbuf >>= 20; + bitsleft -= 20; + REFILL_BITS(); + i = 1; + do { + d->u.precode_lens[deflate_precode_lens_permutation[i]] = + bitbuf & BITMASK(3); + bitbuf >>= 3; + bitsleft -= 3; + } while (++i < num_explicit_precode_lens); + } else { + bitbuf >>= 17; + bitsleft -= 17; + i = 0; + do { + if ((u8)bitsleft < 3) + REFILL_BITS(); + d->u.precode_lens[deflate_precode_lens_permutation[i]] = + bitbuf & BITMASK(3); + bitbuf >>= 3; + bitsleft -= 3; + } while (++i < num_explicit_precode_lens); + } + for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) + d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0; + + /* Build the decode table for the precode. */ + SAFETY_CHECK(build_precode_decode_table(d)); + + /* Decode the litlen and offset codeword lengths. 
*/ + i = 0; + do { + unsigned presym; + u8 rep_val; + unsigned rep_count; + + if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7) + REFILL_BITS(); + + /* + * The code below assumes that the precode decode table + * doesn't have any subtables. + */ + STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); + + /* Decode the next precode symbol. */ + entry = d->u.l.precode_decode_table[ + bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)]; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + presym = entry >> 16; + + if (presym < 16) { + /* Explicit codeword length */ + d->u.l.lens[i++] = presym; + continue; + } + + /* Run-length encoded codeword lengths */ + + /* + * Note: we don't need to immediately verify that the + * repeat count doesn't overflow the number of elements, + * since we've sized the lens array to have enough extra + * space to allow for the worst-case overrun (138 zeroes + * when only 1 length was remaining). + * + * In the case of the small repeat counts (presyms 16 + * and 17), it is fastest to always write the maximum + * number of entries. That gets rid of branches that + * would otherwise be required. + * + * It is not just because of the numerical order that + * our checks go in the order 'presym < 16', 'presym == + * 16', and 'presym == 17'. For typical data this is + * ordered from most frequent to least frequent case. + */ + STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); + + if (presym == 16) { + /* Repeat the previous length 3 - 6 times. */ + SAFETY_CHECK(i != 0); + rep_val = d->u.l.lens[i - 1]; + STATIC_ASSERT(3 + BITMASK(2) == 6); + rep_count = 3 + (bitbuf & BITMASK(2)); + bitbuf >>= 2; + bitsleft -= 2; + d->u.l.lens[i + 0] = rep_val; + d->u.l.lens[i + 1] = rep_val; + d->u.l.lens[i + 2] = rep_val; + d->u.l.lens[i + 3] = rep_val; + d->u.l.lens[i + 4] = rep_val; + d->u.l.lens[i + 5] = rep_val; + i += rep_count; + } else if (presym == 17) { + /* Repeat zero 3 - 10 times. */ + STATIC_ASSERT(3 + BITMASK(3) == 10); + rep_count = 3 + (bitbuf & BITMASK(3)); + bitbuf >>= 3; + bitsleft -= 3; + d->u.l.lens[i + 0] = 0; + d->u.l.lens[i + 1] = 0; + d->u.l.lens[i + 2] = 0; + d->u.l.lens[i + 3] = 0; + d->u.l.lens[i + 4] = 0; + d->u.l.lens[i + 5] = 0; + d->u.l.lens[i + 6] = 0; + d->u.l.lens[i + 7] = 0; + d->u.l.lens[i + 8] = 0; + d->u.l.lens[i + 9] = 0; + i += rep_count; + } else { + /* Repeat zero 11 - 138 times. */ + STATIC_ASSERT(11 + BITMASK(7) == 138); + rep_count = 11 + (bitbuf & BITMASK(7)); + bitbuf >>= 7; + bitsleft -= 7; + memset(&d->u.l.lens[i], 0, + rep_count * sizeof(d->u.l.lens[i])); + i += rep_count; + } + } while (i < num_litlen_syms + num_offset_syms); + + /* Unnecessary, but check this for consistency with zlib. */ + SAFETY_CHECK(i == num_litlen_syms + num_offset_syms); + + } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { + u16 len, nlen; + + /* + * Uncompressed block: copy 'len' bytes literally from the input + * buffer to the output buffer. + */ + + bitsleft -= 3; /* for BTYPE and BFINAL */ + + /* + * Align the bitstream to the next byte boundary. This means + * the next byte boundary as if we were reading a byte at a + * time. Therefore, we have to rewind 'in_next' by any bytes + * that have been refilled but not actually consumed yet (not + * counting overread bytes, which don't increment 'in_next'). 
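+		 *
+		 * For example (hypothetical state): if (u8)bitsleft == 20 and
+		 * overread_count == 0, then 'in_next' is rewound by
+		 * 20 >> 3 = 2 whole unconsumed bytes, and the 4 leftover bits
+		 * of the partially consumed byte are simply discarded, which
+		 * is exactly the byte alignment.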
+ */ + bitsleft = (u8)bitsleft; + SAFETY_CHECK(overread_count <= (bitsleft >> 3)); + in_next -= (bitsleft >> 3) - overread_count; + overread_count = 0; + bitbuf = 0; + bitsleft = 0; + + SAFETY_CHECK(in_end - in_next >= 4); + len = get_unaligned_le16(in_next); + nlen = get_unaligned_le16(in_next + 2); + in_next += 4; + + SAFETY_CHECK(len == (u16)~nlen); + if (unlikely(len > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + SAFETY_CHECK(len <= in_end - in_next); + + memcpy(out_next, in_next, len); + in_next += len; + out_next += len; + + goto block_done; + + } else { + unsigned i; + + SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); + + /* + * Static Huffman block: build the decode tables for the static + * codes. Skip doing so if the tables are already set up from + * an earlier static block; this speeds up decompression of + * degenerate input of many empty or very short static blocks. + * + * Afterwards, the remainder is the same as decompressing a + * dynamic Huffman block. + */ + + bitbuf >>= 3; /* for BTYPE and BFINAL */ + bitsleft -= 3; + + if (d->static_codes_loaded) + goto have_decode_tables; + + d->static_codes_loaded = true; + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); + + for (i = 0; i < 144; i++) + d->u.l.lens[i] = 8; + for (; i < 256; i++) + d->u.l.lens[i] = 9; + for (; i < 280; i++) + d->u.l.lens[i] = 7; + for (; i < 288; i++) + d->u.l.lens[i] = 8; + + for (; i < 288 + 32; i++) + d->u.l.lens[i] = 5; + + num_litlen_syms = 288; + num_offset_syms = 32; + } + + /* Decompressing a Huffman block (either dynamic or static) */ + + SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); + SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); +have_decode_tables: + litlen_tablemask = BITMASK(d->litlen_tablebits); + + /* + * This is the "fastloop" for decoding literals and matches. It does + * bounds checks on in_next and out_next in the loop conditions so that + * additional bounds checks aren't needed inside the loop body. + * + * To reduce latency, the bitbuffer is refilled and the next litlen + * decode table entry is preloaded before each loop iteration. + */ + if (in_next >= in_fastloop_end || out_next >= out_fastloop_end) + goto generic_loop; + REFILL_BITS_IN_FASTLOOP(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + do { + u32 length, offset, lit; + const u8 *src; + u8 *dst; + + /* + * Consume the bits for the litlen decode table entry. Save the + * original bitbuf for later, in case the extra match length + * bits need to be extracted from it. + */ + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + + /* + * Begin by checking for a "fast" literal, i.e. a literal that + * doesn't need a subtable. + */ + if (entry & HUFFDEC_LITERAL) { + /* + * On 64-bit platforms, we decode up to 2 extra fast + * literals in addition to the primary item, as this + * increases performance and still leaves enough bits + * remaining for what follows. We could actually do 3, + * assuming LITLEN_TABLEBITS=11, but that actually + * decreases performance slightly (perhaps by messing + * with the branch prediction of the conditional refill + * that happens later while decoding the match offset). + * + * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN + * and FASTLOOP_MAX_BYTES_READ need to be updated if the + * number of extra literals decoded here is changed. 
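+			 *
+			 * With the table sizes asserted elsewhere in this
+			 * file (LITLEN_TABLEBITS == 11, OFFSET_TABLEBITS == 8)
+			 * and a 64-bit bitbuffer, both conditions below are
+			 * expected to hold, so 64-bit builds take this path.
+			 * A 32-bit bitbuffer cannot consume 2*11 + 15 = 37
+			 * bits at once, so 32-bit builds fall through to the
+			 * single-literal path in the 'else' branch.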
+ */ + if (/* enough bits for 2 fast literals + length + offset preload? */ + CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + + LENGTH_MAXBITS, + OFFSET_TABLEBITS) && + /* enough bits for 2 fast literals + slow literal + litlen preload? */ + CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + LITLEN_TABLEBITS)) { + /* 1st extra fast literal */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + *out_next++ = lit; + if (entry & HUFFDEC_LITERAL) { + /* 2nd extra fast literal */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + *out_next++ = lit; + if (entry & HUFFDEC_LITERAL) { + /* + * Another fast literal, but + * this one is in lieu of the + * primary item, so it doesn't + * count as one of the extras. + */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + } + } else { + /* + * Decode a literal. While doing so, preload + * the next litlen decode table entry and refill + * the bitbuffer. To reduce latency, we've + * arranged for there to be enough "preloadable" + * bits remaining to do the table preload + * independently of the refill. + */ + STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD( + LITLEN_TABLEBITS, LITLEN_TABLEBITS)); + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + } + + /* + * It's not a literal entry, so it can be a length entry, a + * subtable pointer entry, or an end-of-block entry. Detect the + * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag. + */ + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Subtable pointer or end-of-block entry */ + + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + + /* + * A subtable is required. Load and consume the + * subtable entry. The subtable entry can be of any + * type: literal, length, or end-of-block. + */ + entry = d->u.litlen_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + + /* + * 32-bit platforms that use the byte-at-a-time refill + * method have to do a refill here for there to always + * be enough bits to decode a literal that requires a + * subtable, then preload the next litlen decode table + * entry; or to decode a match length that requires a + * subtable, then preload the offset decode table entry. + */ + if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN, + LITLEN_TABLEBITS) || + !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS, + OFFSET_TABLEBITS)) + REFILL_BITS_IN_FASTLOOP(); + if (entry & HUFFDEC_LITERAL) { + /* Decode a literal that required a subtable. */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + /* Else, it's a length that required a subtable. */ + } + + /* + * Decode the match length: the length base value associated + * with the litlen symbol (which we extract from the decode + * table entry), plus the extra length bits. We don't need to + * consume the extra length bits here, as they were included in + * the bits consumed by the entry earlier. 
We also don't need + * to check for too-long matches here, as this is inside the + * fastloop where it's already been verified that the output + * buffer has enough space remaining to copy a max-length match. + */ + length = entry >> 16; + length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + + /* + * Decode the match offset. There are enough "preloadable" bits + * remaining to preload the offset decode table entry, but a + * refill might be needed before consuming it. + */ + STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS, + OFFSET_TABLEBITS)); + entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; + if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS, + LITLEN_TABLEBITS)) { + /* + * Decoding a match offset on a 64-bit platform. We may + * need to refill once, but then we can decode the whole + * offset and preload the next litlen table entry. + */ + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Offset codeword requires a subtable */ + if (unlikely((u8)bitsleft < OFFSET_MAXBITS + + LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS + + LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + } else { + /* Decoding a match offset on a 32-bit platform */ + REFILL_BITS_IN_FASTLOOP(); + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Offset codeword requires a subtable */ + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + REFILL_BITS_IN_FASTLOOP(); + /* No further refill needed before extra bits */ + STATIC_ASSERT(CAN_CONSUME( + OFFSET_MAXBITS - OFFSET_TABLEBITS)); + } else { + /* No refill needed before extra bits */ + STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS)); + } + } + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + offset = entry >> 16; + offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + + /* Validate the match offset; needed even in the fastloop. */ + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + src = out_next - offset; + dst = out_next; + out_next += length; + + /* + * Before starting to issue the instructions to copy the match, + * refill the bitbuffer and preload the litlen decode table + * entry for the next loop iteration. This can increase + * performance by allowing the latency of the match copy to + * overlap with these other operations. To further reduce + * latency, we've arranged for there to be enough bits remaining + * to do the table preload independently of the refill, except + * on 32-bit platforms using the byte-at-a-time refill method. + */ + if (!CAN_CONSUME_AND_THEN_PRELOAD( + MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS, + OFFSET_MAXFASTBITS), + LITLEN_TABLEBITS) && + unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + + /* + * Copy the match. On most CPUs the fastest method is a + * word-at-a-time copy, unconditionally copying about 5 words + * since this is enough for most matches without being too much. + * + * The normal word-at-a-time copy works for offset >= WORDBYTES, + * which is most cases. 
The case of offset == 1 is also common + * and is worth optimizing for, since it is just RLE encoding of + * the previous byte, which is the result of compressing long + * runs of the same byte. + * + * Writing past the match 'length' is allowed here, since it's + * been ensured there is enough output space left for a slight + * overrun. FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if + * the maximum possible overrun here is changed. + */ + if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) { + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + while (dst < out_next) { + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + } + } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) { + machine_word_t v; + + /* + * This part tends to get auto-vectorized, so keep it + * copying a multiple of 16 bytes at a time. + */ + v = (machine_word_t)0x0101010101010101 * src[0]; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + while (dst < out_next) { + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + } + } else if (UNALIGNED_ACCESS_IS_FAST) { + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + do { + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + } while (dst < out_next); + } else { + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + } while (in_next < in_fastloop_end && out_next < out_fastloop_end); + + /* + * This is the generic loop for decoding literals and matches. This + * handles cases where in_next and out_next are close to the end of + * their respective buffers. Usually this loop isn't performance- + * critical, as most time is spent in the fastloop above instead. We + * therefore omit some optimizations here in favor of smaller code. 
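+	 *
+	 * For instance, once fewer than FASTLOOP_MAX_BYTES_READ input bytes or
+	 * FASTLOOP_MAX_BYTES_WRITTEN output bytes remain, the fastloop above
+	 * exits and the rest of the stream is decoded here.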
+ */ +generic_loop: + for (;;) { + u32 length, offset; + const u8 *src; + u8 *dst; + + REFILL_BITS(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) { + entry = d->u.litlen_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + } + length = entry >> 16; + if (entry & HUFFDEC_LITERAL) { + if (unlikely(out_next == out_end)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + *out_next++ = length; + continue; + } + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + if (unlikely(length > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + + if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS)) + REFILL_BITS(); + entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + if (!CAN_CONSUME(OFFSET_MAXBITS)) + REFILL_BITS(); + } + offset = entry >> 16; + offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8); + bitbuf >>= (u8)entry; + bitsleft -= entry; + + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + src = out_next - offset; + dst = out_next; + out_next += length; + + STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3); + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + +block_done: + /* Finished decoding a block */ + + if (!is_final_block) + goto next_block; + + /* That was the last block. */ + + bitsleft = (u8)bitsleft; + + /* + * If any of the implicit appended zero bytes were consumed (not just + * refilled) before hitting end of stream, then the data is bad. + */ + SAFETY_CHECK(overread_count <= (bitsleft >> 3)); + + /* Optionally return the actual number of bytes consumed. */ + if (actual_in_nbytes_ret) { + /* Don't count bytes that were refilled but not consumed. */ + in_next -= (bitsleft >> 3) - overread_count; + + *actual_in_nbytes_ret = in_next - (u8 *)in; + } + + /* Optionally return the actual number of bytes written. */ + if (actual_out_nbytes_ret) { + *actual_out_nbytes_ret = out_next - (u8 *)out; + } else { + if (out_next != out_end) + return LIBDEFLATE_SHORT_OUTPUT; + } + return LIBDEFLATE_SUCCESS; +} + +#undef FUNCNAME +#undef ATTRIBUTES +#undef EXTRACT_VARBITS +#undef EXTRACT_VARBITS8 + + +/* Include architecture-specific implementation(s) if available. */ +#undef DEFAULT_IMPL +#undef arch_select_decompress_func +#if defined(ARCH_X86_32) || defined(ARCH_X86_64) +#ifndef LIB_X86_DECOMPRESS_IMPL_H +#define LIB_X86_DECOMPRESS_IMPL_H + +/* + * BMI2 optimized version + * + * FIXME: with MSVC, this isn't actually compiled with BMI2 code generation + * enabled yet. That would require that this be moved to its own .c file. + */ +#if HAVE_BMI2_INTRIN +# define deflate_decompress_bmi2 deflate_decompress_bmi2 +# define FUNCNAME deflate_decompress_bmi2 +# if !HAVE_BMI2_NATIVE +# define ATTRIBUTES _target_attribute("bmi2") +# endif + /* + * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the + * bzhi instruction for 'word & BITMASK(count)'. So use the bzhi intrinsic + * explicitly. 
EXTRACT_VARBITS() is equivalent to 'word & BITMASK(count)'; + * EXTRACT_VARBITS8() is equivalent to 'word & BITMASK((u8)count)'. + * Nevertheless, their implementation using the bzhi intrinsic is identical, + * as the bzhi instruction truncates the count to 8 bits implicitly. + */ +# ifndef __clang__ +# include <immintrin.h> +# ifdef ARCH_X86_64 +# define EXTRACT_VARBITS(word, count) _bzhi_u64((word), (count)) +# define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count)) +# else +# define EXTRACT_VARBITS(word, count) _bzhi_u32((word), (count)) +# define EXTRACT_VARBITS8(word, count) _bzhi_u32((word), (count)) +# endif +# endif +/* + * decompress_template.h + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * This is the actual DEFLATE decompression routine, lifted out of + * deflate_decompress.c so that it can be compiled multiple times with different + * target instruction sets. 
+ */ + +#ifndef ATTRIBUTES +# define ATTRIBUTES +#endif +#ifndef EXTRACT_VARBITS +# define EXTRACT_VARBITS(word, count) ((word) & BITMASK(count)) +#endif +#ifndef EXTRACT_VARBITS8 +# define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count))) +#endif + +static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED +FUNCNAME(struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) +{ + u8 *out_next = out; + u8 * const out_end = out_next + out_nbytes_avail; + u8 * const out_fastloop_end = + out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN); + + /* Input bitstream state; see deflate_decompress.c for documentation */ + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + const u8 * const in_fastloop_end = + in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ); + bitbuf_t bitbuf = 0; + bitbuf_t saved_bitbuf; + u32 bitsleft = 0; + size_t overread_count = 0; + + bool is_final_block; + unsigned block_type; + unsigned num_litlen_syms; + unsigned num_offset_syms; + bitbuf_t litlen_tablemask; + u32 entry; + +next_block: + /* Starting to read the next block */ + ; + + STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3)); + REFILL_BITS(); + + /* BFINAL: 1 bit */ + is_final_block = bitbuf & BITMASK(1); + + /* BTYPE: 2 bits */ + block_type = (bitbuf >> 1) & BITMASK(2); + + if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { + + /* Dynamic Huffman block */ + + /* The order in which precode lengths are stored */ + static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + + unsigned num_explicit_precode_lens; + unsigned i; + + /* Read the codeword length counts. */ + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5)); + num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5)); + + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5)); + num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5)); + + STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4)); + num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4)); + + d->static_codes_loaded = false; + + /* + * Read the precode codeword lengths. + * + * A 64-bit bitbuffer is just one bit too small to hold the + * maximum number of precode lens, so to minimize branches we + * merge one len with the previous fields. + */ + STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); + if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { + d->u.precode_lens[deflate_precode_lens_permutation[0]] = + (bitbuf >> 17) & BITMASK(3); + bitbuf >>= 20; + bitsleft -= 20; + REFILL_BITS(); + i = 1; + do { + d->u.precode_lens[deflate_precode_lens_permutation[i]] = + bitbuf & BITMASK(3); + bitbuf >>= 3; + bitsleft -= 3; + } while (++i < num_explicit_precode_lens); + } else { + bitbuf >>= 17; + bitsleft -= 17; + i = 0; + do { + if ((u8)bitsleft < 3) + REFILL_BITS(); + d->u.precode_lens[deflate_precode_lens_permutation[i]] = + bitbuf & BITMASK(3); + bitbuf >>= 3; + bitsleft -= 3; + } while (++i < num_explicit_precode_lens); + } + for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) + d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0; + + /* Build the decode table for the precode. */ + SAFETY_CHECK(build_precode_decode_table(d)); + + /* Decode the litlen and offset codeword lengths. 
*/ + i = 0; + do { + unsigned presym; + u8 rep_val; + unsigned rep_count; + + if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7) + REFILL_BITS(); + + /* + * The code below assumes that the precode decode table + * doesn't have any subtables. + */ + STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); + + /* Decode the next precode symbol. */ + entry = d->u.l.precode_decode_table[ + bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)]; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + presym = entry >> 16; + + if (presym < 16) { + /* Explicit codeword length */ + d->u.l.lens[i++] = presym; + continue; + } + + /* Run-length encoded codeword lengths */ + + /* + * Note: we don't need to immediately verify that the + * repeat count doesn't overflow the number of elements, + * since we've sized the lens array to have enough extra + * space to allow for the worst-case overrun (138 zeroes + * when only 1 length was remaining). + * + * In the case of the small repeat counts (presyms 16 + * and 17), it is fastest to always write the maximum + * number of entries. That gets rid of branches that + * would otherwise be required. + * + * It is not just because of the numerical order that + * our checks go in the order 'presym < 16', 'presym == + * 16', and 'presym == 17'. For typical data this is + * ordered from most frequent to least frequent case. + */ + STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); + + if (presym == 16) { + /* Repeat the previous length 3 - 6 times. */ + SAFETY_CHECK(i != 0); + rep_val = d->u.l.lens[i - 1]; + STATIC_ASSERT(3 + BITMASK(2) == 6); + rep_count = 3 + (bitbuf & BITMASK(2)); + bitbuf >>= 2; + bitsleft -= 2; + d->u.l.lens[i + 0] = rep_val; + d->u.l.lens[i + 1] = rep_val; + d->u.l.lens[i + 2] = rep_val; + d->u.l.lens[i + 3] = rep_val; + d->u.l.lens[i + 4] = rep_val; + d->u.l.lens[i + 5] = rep_val; + i += rep_count; + } else if (presym == 17) { + /* Repeat zero 3 - 10 times. */ + STATIC_ASSERT(3 + BITMASK(3) == 10); + rep_count = 3 + (bitbuf & BITMASK(3)); + bitbuf >>= 3; + bitsleft -= 3; + d->u.l.lens[i + 0] = 0; + d->u.l.lens[i + 1] = 0; + d->u.l.lens[i + 2] = 0; + d->u.l.lens[i + 3] = 0; + d->u.l.lens[i + 4] = 0; + d->u.l.lens[i + 5] = 0; + d->u.l.lens[i + 6] = 0; + d->u.l.lens[i + 7] = 0; + d->u.l.lens[i + 8] = 0; + d->u.l.lens[i + 9] = 0; + i += rep_count; + } else { + /* Repeat zero 11 - 138 times. */ + STATIC_ASSERT(11 + BITMASK(7) == 138); + rep_count = 11 + (bitbuf & BITMASK(7)); + bitbuf >>= 7; + bitsleft -= 7; + memset(&d->u.l.lens[i], 0, + rep_count * sizeof(d->u.l.lens[i])); + i += rep_count; + } + } while (i < num_litlen_syms + num_offset_syms); + + /* Unnecessary, but check this for consistency with zlib. */ + SAFETY_CHECK(i == num_litlen_syms + num_offset_syms); + + } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { + u16 len, nlen; + + /* + * Uncompressed block: copy 'len' bytes literally from the input + * buffer to the output buffer. + */ + + bitsleft -= 3; /* for BTYPE and BFINAL */ + + /* + * Align the bitstream to the next byte boundary. This means + * the next byte boundary as if we were reading a byte at a + * time. Therefore, we have to rewind 'in_next' by any bytes + * that have been refilled but not actually consumed yet (not + * counting overread bytes, which don't increment 'in_next'). 
+ */ + bitsleft = (u8)bitsleft; + SAFETY_CHECK(overread_count <= (bitsleft >> 3)); + in_next -= (bitsleft >> 3) - overread_count; + overread_count = 0; + bitbuf = 0; + bitsleft = 0; + + SAFETY_CHECK(in_end - in_next >= 4); + len = get_unaligned_le16(in_next); + nlen = get_unaligned_le16(in_next + 2); + in_next += 4; + + SAFETY_CHECK(len == (u16)~nlen); + if (unlikely(len > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + SAFETY_CHECK(len <= in_end - in_next); + + memcpy(out_next, in_next, len); + in_next += len; + out_next += len; + + goto block_done; + + } else { + unsigned i; + + SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); + + /* + * Static Huffman block: build the decode tables for the static + * codes. Skip doing so if the tables are already set up from + * an earlier static block; this speeds up decompression of + * degenerate input of many empty or very short static blocks. + * + * Afterwards, the remainder is the same as decompressing a + * dynamic Huffman block. + */ + + bitbuf >>= 3; /* for BTYPE and BFINAL */ + bitsleft -= 3; + + if (d->static_codes_loaded) + goto have_decode_tables; + + d->static_codes_loaded = true; + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); + + for (i = 0; i < 144; i++) + d->u.l.lens[i] = 8; + for (; i < 256; i++) + d->u.l.lens[i] = 9; + for (; i < 280; i++) + d->u.l.lens[i] = 7; + for (; i < 288; i++) + d->u.l.lens[i] = 8; + + for (; i < 288 + 32; i++) + d->u.l.lens[i] = 5; + + num_litlen_syms = 288; + num_offset_syms = 32; + } + + /* Decompressing a Huffman block (either dynamic or static) */ + + SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); + SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); +have_decode_tables: + litlen_tablemask = BITMASK(d->litlen_tablebits); + + /* + * This is the "fastloop" for decoding literals and matches. It does + * bounds checks on in_next and out_next in the loop conditions so that + * additional bounds checks aren't needed inside the loop body. + * + * To reduce latency, the bitbuffer is refilled and the next litlen + * decode table entry is preloaded before each loop iteration. + */ + if (in_next >= in_fastloop_end || out_next >= out_fastloop_end) + goto generic_loop; + REFILL_BITS_IN_FASTLOOP(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + do { + u32 length, offset, lit; + const u8 *src; + u8 *dst; + + /* + * Consume the bits for the litlen decode table entry. Save the + * original bitbuf for later, in case the extra match length + * bits need to be extracted from it. + */ + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + + /* + * Begin by checking for a "fast" literal, i.e. a literal that + * doesn't need a subtable. + */ + if (entry & HUFFDEC_LITERAL) { + /* + * On 64-bit platforms, we decode up to 2 extra fast + * literals in addition to the primary item, as this + * increases performance and still leaves enough bits + * remaining for what follows. We could actually do 3, + * assuming LITLEN_TABLEBITS=11, but that actually + * decreases performance slightly (perhaps by messing + * with the branch prediction of the conditional refill + * that happens later while decoding the match offset). + * + * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN + * and FASTLOOP_MAX_BYTES_READ need to be updated if the + * number of extra literals decoded here is changed. 
+ */ + if (/* enough bits for 2 fast literals + length + offset preload? */ + CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + + LENGTH_MAXBITS, + OFFSET_TABLEBITS) && + /* enough bits for 2 fast literals + slow literal + litlen preload? */ + CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + LITLEN_TABLEBITS)) { + /* 1st extra fast literal */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + *out_next++ = lit; + if (entry & HUFFDEC_LITERAL) { + /* 2nd extra fast literal */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + *out_next++ = lit; + if (entry & HUFFDEC_LITERAL) { + /* + * Another fast literal, but + * this one is in lieu of the + * primary item, so it doesn't + * count as one of the extras. + */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + } + } else { + /* + * Decode a literal. While doing so, preload + * the next litlen decode table entry and refill + * the bitbuffer. To reduce latency, we've + * arranged for there to be enough "preloadable" + * bits remaining to do the table preload + * independently of the refill. + */ + STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD( + LITLEN_TABLEBITS, LITLEN_TABLEBITS)); + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + } + + /* + * It's not a literal entry, so it can be a length entry, a + * subtable pointer entry, or an end-of-block entry. Detect the + * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag. + */ + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Subtable pointer or end-of-block entry */ + + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + + /* + * A subtable is required. Load and consume the + * subtable entry. The subtable entry can be of any + * type: literal, length, or end-of-block. + */ + entry = d->u.litlen_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + + /* + * 32-bit platforms that use the byte-at-a-time refill + * method have to do a refill here for there to always + * be enough bits to decode a literal that requires a + * subtable, then preload the next litlen decode table + * entry; or to decode a match length that requires a + * subtable, then preload the offset decode table entry. + */ + if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN, + LITLEN_TABLEBITS) || + !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS, + OFFSET_TABLEBITS)) + REFILL_BITS_IN_FASTLOOP(); + if (entry & HUFFDEC_LITERAL) { + /* Decode a literal that required a subtable. */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + /* Else, it's a length that required a subtable. */ + } + + /* + * Decode the match length: the length base value associated + * with the litlen symbol (which we extract from the decode + * table entry), plus the extra length bits. We don't need to + * consume the extra length bits here, as they were included in + * the bits consumed by the entry earlier. 
We also don't need + * to check for too-long matches here, as this is inside the + * fastloop where it's already been verified that the output + * buffer has enough space remaining to copy a max-length match. + */ + length = entry >> 16; + length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + + /* + * Decode the match offset. There are enough "preloadable" bits + * remaining to preload the offset decode table entry, but a + * refill might be needed before consuming it. + */ + STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS, + OFFSET_TABLEBITS)); + entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; + if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS, + LITLEN_TABLEBITS)) { + /* + * Decoding a match offset on a 64-bit platform. We may + * need to refill once, but then we can decode the whole + * offset and preload the next litlen table entry. + */ + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Offset codeword requires a subtable */ + if (unlikely((u8)bitsleft < OFFSET_MAXBITS + + LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS + + LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + } else { + /* Decoding a match offset on a 32-bit platform */ + REFILL_BITS_IN_FASTLOOP(); + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Offset codeword requires a subtable */ + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + REFILL_BITS_IN_FASTLOOP(); + /* No further refill needed before extra bits */ + STATIC_ASSERT(CAN_CONSUME( + OFFSET_MAXBITS - OFFSET_TABLEBITS)); + } else { + /* No refill needed before extra bits */ + STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS)); + } + } + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + offset = entry >> 16; + offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + + /* Validate the match offset; needed even in the fastloop. */ + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + src = out_next - offset; + dst = out_next; + out_next += length; + + /* + * Before starting to issue the instructions to copy the match, + * refill the bitbuffer and preload the litlen decode table + * entry for the next loop iteration. This can increase + * performance by allowing the latency of the match copy to + * overlap with these other operations. To further reduce + * latency, we've arranged for there to be enough bits remaining + * to do the table preload independently of the refill, except + * on 32-bit platforms using the byte-at-a-time refill method. + */ + if (!CAN_CONSUME_AND_THEN_PRELOAD( + MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS, + OFFSET_MAXFASTBITS), + LITLEN_TABLEBITS) && + unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + + /* + * Copy the match. On most CPUs the fastest method is a + * word-at-a-time copy, unconditionally copying about 5 words + * since this is enough for most matches without being too much. + * + * The normal word-at-a-time copy works for offset >= WORDBYTES, + * which is most cases. 
The case of offset == 1 is also common + * and is worth optimizing for, since it is just RLE encoding of + * the previous byte, which is the result of compressing long + * runs of the same byte. + * + * Writing past the match 'length' is allowed here, since it's + * been ensured there is enough output space left for a slight + * overrun. FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if + * the maximum possible overrun here is changed. + */ + if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) { + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + while (dst < out_next) { + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + } + } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) { + machine_word_t v; + + /* + * This part tends to get auto-vectorized, so keep it + * copying a multiple of 16 bytes at a time. + */ + v = (machine_word_t)0x0101010101010101 * src[0]; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + while (dst < out_next) { + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + } + } else if (UNALIGNED_ACCESS_IS_FAST) { + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + do { + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + } while (dst < out_next); + } else { + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + } while (in_next < in_fastloop_end && out_next < out_fastloop_end); + + /* + * This is the generic loop for decoding literals and matches. This + * handles cases where in_next and out_next are close to the end of + * their respective buffers. Usually this loop isn't performance- + * critical, as most time is spent in the fastloop above instead. We + * therefore omit some optimizations here in favor of smaller code. 
+ */ +generic_loop: + for (;;) { + u32 length, offset; + const u8 *src; + u8 *dst; + + REFILL_BITS(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) { + entry = d->u.litlen_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + } + length = entry >> 16; + if (entry & HUFFDEC_LITERAL) { + if (unlikely(out_next == out_end)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + *out_next++ = length; + continue; + } + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + if (unlikely(length > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + + if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS)) + REFILL_BITS(); + entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + if (!CAN_CONSUME(OFFSET_MAXBITS)) + REFILL_BITS(); + } + offset = entry >> 16; + offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8); + bitbuf >>= (u8)entry; + bitsleft -= entry; + + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + src = out_next - offset; + dst = out_next; + out_next += length; + + STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3); + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + +block_done: + /* Finished decoding a block */ + + if (!is_final_block) + goto next_block; + + /* That was the last block. */ + + bitsleft = (u8)bitsleft; + + /* + * If any of the implicit appended zero bytes were consumed (not just + * refilled) before hitting end of stream, then the data is bad. + */ + SAFETY_CHECK(overread_count <= (bitsleft >> 3)); + + /* Optionally return the actual number of bytes consumed. */ + if (actual_in_nbytes_ret) { + /* Don't count bytes that were refilled but not consumed. */ + in_next -= (bitsleft >> 3) - overread_count; + + *actual_in_nbytes_ret = in_next - (u8 *)in; + } + + /* Optionally return the actual number of bytes written. */ + if (actual_out_nbytes_ret) { + *actual_out_nbytes_ret = out_next - (u8 *)out; + } else { + if (out_next != out_end) + return LIBDEFLATE_SHORT_OUTPUT; + } + return LIBDEFLATE_SUCCESS; +} + +#undef FUNCNAME +#undef ATTRIBUTES +#undef EXTRACT_VARBITS +#undef EXTRACT_VARBITS8 + +#endif /* HAVE_BMI2_INTRIN */ + +#if defined(deflate_decompress_bmi2) && HAVE_BMI2_NATIVE +#define DEFAULT_IMPL deflate_decompress_bmi2 +#else +static inline decompress_func_t +arch_select_decompress_func(void) +{ +#ifdef deflate_decompress_bmi2 + if (HAVE_BMI2(get_x86_cpu_features())) + return deflate_decompress_bmi2; +#endif + return NULL; +} +#define arch_select_decompress_func arch_select_decompress_func +#endif + +#endif /* LIB_X86_DECOMPRESS_IMPL_H */ + +#endif + +#ifndef DEFAULT_IMPL +# define DEFAULT_IMPL deflate_decompress_default +#endif + +#ifdef arch_select_decompress_func +static enum libdeflate_result +dispatch_decomp(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); + +static volatile decompress_func_t decompress_impl = dispatch_decomp; + +/* Choose the best implementation at runtime. 
*/ +static enum libdeflate_result +dispatch_decomp(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) +{ + decompress_func_t f = arch_select_decompress_func(); + + if (f == NULL) + f = DEFAULT_IMPL; + + decompress_impl = f; + return f(d, in, in_nbytes, out, out_nbytes_avail, + actual_in_nbytes_ret, actual_out_nbytes_ret); +} +#else +/* The best implementation is statically known, so call it directly. */ +# define decompress_impl DEFAULT_IMPL +#endif + +/* + * This is the main DEFLATE decompression routine. See libdeflate.h for the + * documentation. + * + * Note that the real code is in decompress_template.h. The part here just + * handles calling the appropriate implementation depending on the CPU features + * at runtime. + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) +{ + return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail, + actual_in_nbytes_ret, actual_out_nbytes_ret); +} + +LIBDEFLATEAPI enum libdeflate_result +libdeflate_deflate_decompress(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret) +{ + return libdeflate_deflate_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, actual_out_nbytes_ret); +} + +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options) +{ + struct libdeflate_decompressor *d; + + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. + */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + + d = (options->malloc_func ? options->malloc_func : + libdeflate_default_malloc_func)(sizeof(*d)); + if (d == NULL) + return NULL; + /* + * Note that only certain parts of the decompressor actually must be + * initialized here: + * + * - 'static_codes_loaded' must be initialized to false. + * + * - The first half of the main portion of each decode table must be + * initialized to any value, to avoid reading from uninitialized + * memory during table expansion in build_decode_table(). (Although, + * this is really just to avoid warnings with dynamic tools like + * valgrind, since build_decode_table() is guaranteed to initialize + * all entries eventually anyway.) + * + * - 'free_func' must be set. + * + * But for simplicity, we currently just zero the whole decompressor. + */ + memset(d, 0, sizeof(*d)); + d->free_func = options->free_func ? 
+ options->free_func : libdeflate_default_free_func; + return d; +} + +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor(void) +{ + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_decompressor_ex(&defaults); +} + +LIBDEFLATEAPI void +libdeflate_free_decompressor(struct libdeflate_decompressor *d) +{ + if (d) + d->free_func(d); +} + + +/* + * utils.c - utility functions for libdeflate + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifdef FREESTANDING +# define malloc NULL +# define free NULL +#else +# include <stdlib.h> +#endif + +malloc_func_t libdeflate_default_malloc_func = malloc; +free_func_t libdeflate_default_free_func = free; + +void * +libdeflate_aligned_malloc(malloc_func_t malloc_func, + size_t alignment, size_t size) +{ + void *ptr = (*malloc_func)(sizeof(void *) + alignment - 1 + size); + + if (ptr) { + void *orig_ptr = ptr; + + ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment); + ((void **)ptr)[-1] = orig_ptr; + } + return ptr; +} + +void +libdeflate_aligned_free(free_func_t free_func, void *ptr) +{ + (*free_func)(((void **)ptr)[-1]); +} + +LIBDEFLATEAPI void +libdeflate_set_memory_allocator(malloc_func_t malloc_func, + free_func_t free_func) +{ + libdeflate_default_malloc_func = malloc_func; + libdeflate_default_free_func = free_func; +} + +/* + * Implementations of libc functions for freestanding library builds. + * Normal library builds don't use these. Not optimized yet; usually the + * compiler expands these functions and doesn't actually call them anyway. 
+ */ +#ifdef FREESTANDING +#undef memset +void * __attribute__((weak)) +memset(void *s, int c, size_t n) +{ + u8 *p = s; + size_t i; + + for (i = 0; i < n; i++) + p[i] = c; + return s; +} + +#undef memcpy +void * __attribute__((weak)) +memcpy(void *dest, const void *src, size_t n) +{ + u8 *d = dest; + const u8 *s = src; + size_t i; + + for (i = 0; i < n; i++) + d[i] = s[i]; + return dest; +} + +#undef memmove +void * __attribute__((weak)) +memmove(void *dest, const void *src, size_t n) +{ + u8 *d = dest; + const u8 *s = src; + size_t i; + + if (d <= s) + return memcpy(d, s, n); + + for (i = n; i > 0; i--) + d[i - 1] = s[i - 1]; + return dest; +} + +#undef memcmp +int __attribute__((weak)) +memcmp(const void *s1, const void *s2, size_t n) +{ + const u8 *p1 = s1; + const u8 *p2 = s2; + size_t i; + + for (i = 0; i < n; i++) { + if (p1[i] != p2[i]) + return (int)p1[i] - (int)p2[i]; + } + return 0; +} +#endif /* FREESTANDING */ + +#ifdef LIBDEFLATE_ENABLE_ASSERTIONS +#include <stdio.h> +#include <stdlib.h> +void +libdeflate_assertion_failed(const char *expr, const char *file, int line) +{ + fprintf(stderr, "Assertion failed: %s at %s:%d\n", expr, file, line); + abort(); +} +#endif /* LIBDEFLATE_ENABLE_ASSERTIONS */ + +/* + * x86/cpu_features.c - feature detection for x86 CPUs + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#if HAVE_DYNAMIC_X86_CPU_FEATURES + +/* + * With old GCC versions we have to manually save and restore the x86_32 PIC + * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 + */ +#if defined(ARCH_X86_32) && defined(__PIC__) +# define EBX_CONSTRAINT "=&r" +#else +# define EBX_CONSTRAINT "=b" +#endif + +/* Execute the CPUID instruction. */ +static inline void +cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) +{ +#ifdef _MSC_VER + int result[4]; + + __cpuidex(result, leaf, subleaf); + *a = result[0]; + *b = result[1]; + *c = result[2]; + *d = result[3]; +#else + __asm__ volatile(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" + "cpuid \n" + ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" + : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) + : "a" (leaf), "c" (subleaf)); +#endif +} + +/* Read an extended control register. */ +static inline u64 +read_xcr(u32 index) +{ +#ifdef _MSC_VER + return _xgetbv(index); +#else + u32 d, a; + + /* + * Execute the "xgetbv" instruction. Old versions of binutils do not + * recognize this instruction, so list the raw bytes instead. 
+ * + * This must be 'volatile' to prevent this code from being moved out + * from under the check for OSXSAVE. + */ + __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : + "=d" (d), "=a" (a) : "c" (index)); + + return ((u64)d << 32) | a; +#endif +} + +static const struct cpu_feature x86_cpu_feature_table[] = { + {X86_CPU_FEATURE_SSE2, "sse2"}, + {X86_CPU_FEATURE_PCLMUL, "pclmul"}, + {X86_CPU_FEATURE_AVX, "avx"}, + {X86_CPU_FEATURE_AVX2, "avx2"}, + {X86_CPU_FEATURE_BMI2, "bmi2"}, +}; + +volatile u32 libdeflate_x86_cpu_features = 0; + +/* Initialize libdeflate_x86_cpu_features. */ +void libdeflate_init_x86_cpu_features(void) +{ + u32 max_leaf, a, b, c, d; + u64 xcr0 = 0; + u32 features = 0; + + /* EAX=0: Highest Function Parameter and Manufacturer ID */ + cpuid(0, 0, &max_leaf, &b, &c, &d); + if (max_leaf < 1) + goto out; + + /* EAX=1: Processor Info and Feature Bits */ + cpuid(1, 0, &a, &b, &c, &d); + if (d & (1 << 26)) + features |= X86_CPU_FEATURE_SSE2; + if (c & (1 << 1)) + features |= X86_CPU_FEATURE_PCLMUL; + if (c & (1 << 27)) + xcr0 = read_xcr(0); + if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6)) + features |= X86_CPU_FEATURE_AVX; + + if (max_leaf < 7) + goto out; + + /* EAX=7, ECX=0: Extended Features */ + cpuid(7, 0, &a, &b, &c, &d); + if ((b & (1 << 5)) && ((xcr0 & 0x6) == 0x6)) + features |= X86_CPU_FEATURE_AVX2; + if (b & (1 << 8)) + features |= X86_CPU_FEATURE_BMI2; + +out: + disable_cpu_features_for_testing(&features, x86_cpu_feature_table, + ARRAY_LEN(x86_cpu_feature_table)); + + libdeflate_x86_cpu_features = features | X86_CPU_FEATURES_KNOWN; +} + +#endif /* HAVE_DYNAMIC_X86_CPU_FEATURES */ diff --git a/source/fbxplugin/libdeflate.h b/source/fbxplugin/libdeflate.h new file mode 100644 index 0000000000000000000000000000000000000000..382d895de89c237960a6b4d8a5ba32013e67ba53 --- /dev/null +++ b/source/fbxplugin/libdeflate.h @@ -0,0 +1,411 @@ +/* + * libdeflate.h - public header for libdeflate + */ + +#ifndef LIBDEFLATE_H +#define LIBDEFLATE_H + +#include <stddef.h> +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define LIBDEFLATE_VERSION_MAJOR 1 +#define LIBDEFLATE_VERSION_MINOR 18 +#define LIBDEFLATE_VERSION_STRING "1.18" + +/* + * Users of libdeflate.dll on Windows can define LIBDEFLATE_DLL to cause + * __declspec(dllimport) to be used. This should be done when it's easy to do. + * Otherwise it's fine to skip it, since it is a very minor performance + * optimization that is irrelevant for most use cases of libdeflate. + */ +#ifndef LIBDEFLATEAPI +# if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__)) +# define LIBDEFLATEAPI __declspec(dllimport) +# else +# define LIBDEFLATEAPI +# endif +#endif + +/* ========================================================================== */ +/* Compression */ +/* ========================================================================== */ + +struct libdeflate_compressor; +struct libdeflate_options; + +/* + * libdeflate_alloc_compressor() allocates a new compressor that supports + * DEFLATE, zlib, and gzip compression. 'compression_level' is the compression + * level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 = + * medium/default, 9 = slow, 12 = slowest). Level 0 is also supported and means + * "no compression", specifically "create a valid stream, but only emit + * uncompressed blocks" (this will expand the data slightly). + * + * The return value is a pointer to the new compressor, or NULL if out of memory + * or if the compression level is invalid (i.e. outside the range [0, 12]). 
+ * + * Note: for compression, the sliding window size is defined at compilation time + * to 32768, the largest size permissible in the DEFLATE format. It cannot be + * changed at runtime. + * + * A single compressor is not safe to use by multiple threads concurrently. + * However, different threads may use different compressors concurrently. + */ +LIBDEFLATEAPI struct libdeflate_compressor * +libdeflate_alloc_compressor(int compression_level); + +/* + * Like libdeflate_alloc_compressor(), but adds the 'options' argument. + */ +LIBDEFLATEAPI struct libdeflate_compressor * +libdeflate_alloc_compressor_ex(int compression_level, + const struct libdeflate_options *options); + +/* + * libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of + * data. It attempts to compress 'in_nbytes' bytes of data located at 'in' and + * write the result to 'out', which has space for 'out_nbytes_avail' bytes. The + * return value is the compressed size in bytes, or 0 if the data could not be + * compressed to 'out_nbytes_avail' bytes or fewer (but see note below). + * + * If compression is successful, then the output data is guaranteed to be a + * valid DEFLATE stream that decompresses to the input data. No other + * guarantees are made about the output data. Notably, different versions of + * libdeflate can produce different compressed data for the same uncompressed + * data, even at the same compression level. Do ***NOT*** do things like + * writing tests that compare compressed data to a golden output, as this can + * break when libdeflate is updated. (This property isn't specific to + * libdeflate; the same is true for zlib and other compression libraries too.) + */ +LIBDEFLATEAPI size_t +libdeflate_deflate_compress(struct libdeflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +/* + * libdeflate_deflate_compress_bound() returns a worst-case upper bound on the + * number of bytes of compressed data that may be produced by compressing any + * buffer of length less than or equal to 'in_nbytes' using + * libdeflate_deflate_compress() with the specified compressor. This bound will + * necessarily be a number greater than or equal to 'in_nbytes'. It may be an + * overestimate of the true upper bound. The return value is guaranteed to be + * the same for all invocations with the same compressor and same 'in_nbytes'. + * + * As a special case, 'compressor' may be NULL. This causes the bound to be + * taken across *any* libdeflate_compressor that could ever be allocated with + * this build of the library, with any options. + * + * Note that this function is not necessary in many applications. With + * block-based compression, it is usually preferable to separately store the + * uncompressed size of each block and to store any blocks that did not compress + * to less than their original size uncompressed. In that scenario, there is no + * need to know the worst-case compressed size, since the maximum number of + * bytes of compressed data that may be used would always be one less than the + * input length. You can just pass a buffer of that size to + * libdeflate_deflate_compress() and store the data uncompressed if + * libdeflate_deflate_compress() returns 0, indicating that the compressed data + * did not fit into the provided output buffer. 
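As a quick orientation for readers of this vendored header, the compress-or-store pattern recommended above can be sketched roughly as follows. This is only an illustration under assumptions: compress_block_or_store and the level-6 choice are hypothetical and are not part of this patch; only functions declared in this header are used.

/*
 * Hypothetical sketch (not part of this patch): compress one block with
 * libdeflate and fall back to storing it uncompressed when it does not
 * shrink, as suggested above for block-based formats.
 */
#include <string.h>
#include "libdeflate.h"

static size_t
compress_block_or_store(const void *in, size_t in_nbytes,
                        void *out /* at least in_nbytes bytes */)
{
	struct libdeflate_compressor *c;
	size_t n = 0;

	if (in_nbytes == 0)
		return 0;
	c = libdeflate_alloc_compressor(6); /* 6 = medium/default level */
	if (c != NULL) {
		/* Offer one byte less than the input; a return value of 0
		 * then means "did not compress to a smaller size". */
		n = libdeflate_deflate_compress(c, in, in_nbytes,
						out, in_nbytes - 1);
		libdeflate_free_compressor(c);
	}
	if (n == 0) {
		/* Incompressible (or allocation failed): store as-is; the
		 * caller would record that this block is uncompressed. */
		memcpy(out, in, in_nbytes);
		n = in_nbytes;
	}
	return n;
}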
+ */ +LIBDEFLATEAPI size_t +libdeflate_deflate_compress_bound(struct libdeflate_compressor *compressor, + size_t in_nbytes); + +/* + * Like libdeflate_deflate_compress(), but uses the zlib wrapper format instead + * of raw DEFLATE. + */ +LIBDEFLATEAPI size_t +libdeflate_zlib_compress(struct libdeflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +/* + * Like libdeflate_deflate_compress_bound(), but assumes the data will be + * compressed with libdeflate_zlib_compress() rather than with + * libdeflate_deflate_compress(). + */ +LIBDEFLATEAPI size_t +libdeflate_zlib_compress_bound(struct libdeflate_compressor *compressor, + size_t in_nbytes); + +/* + * Like libdeflate_deflate_compress(), but uses the gzip wrapper format instead + * of raw DEFLATE. + */ +LIBDEFLATEAPI size_t +libdeflate_gzip_compress(struct libdeflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +/* + * Like libdeflate_deflate_compress_bound(), but assumes the data will be + * compressed with libdeflate_gzip_compress() rather than with + * libdeflate_deflate_compress(). + */ +LIBDEFLATEAPI size_t +libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor, + size_t in_nbytes); + +/* + * libdeflate_free_compressor() frees a compressor that was allocated with + * libdeflate_alloc_compressor(). If a NULL pointer is passed in, no action is + * taken. + */ +LIBDEFLATEAPI void +libdeflate_free_compressor(struct libdeflate_compressor *compressor); + +/* ========================================================================== */ +/* Decompression */ +/* ========================================================================== */ + +struct libdeflate_decompressor; +struct libdeflate_options; + +/* + * libdeflate_alloc_decompressor() allocates a new decompressor that can be used + * for DEFLATE, zlib, and gzip decompression. The return value is a pointer to + * the new decompressor, or NULL if out of memory. + * + * This function takes no parameters, and the returned decompressor is valid for + * decompressing data that was compressed at any compression level and with any + * sliding window size. + * + * A single decompressor is not safe to use by multiple threads concurrently. + * However, different threads may use different decompressors concurrently. + */ +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor(void); + +/* + * Like libdeflate_alloc_decompressor(), but adds the 'options' argument. + */ +LIBDEFLATEAPI struct libdeflate_decompressor * +libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options); + +/* + * Result of a call to libdeflate_deflate_decompress(), + * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress(). + */ +enum libdeflate_result { + /* Decompression was successful. */ + LIBDEFLATE_SUCCESS = 0, + + /* Decompression failed because the compressed data was invalid, + * corrupt, or otherwise unsupported. */ + LIBDEFLATE_BAD_DATA = 1, + + /* A NULL 'actual_out_nbytes_ret' was provided, but the data would have + * decompressed to fewer than 'out_nbytes_avail' bytes. */ + LIBDEFLATE_SHORT_OUTPUT = 2, + + /* The data would have decompressed to more than 'out_nbytes_avail' + * bytes. */ + LIBDEFLATE_INSUFFICIENT_SPACE = 3, +}; + +/* + * libdeflate_deflate_decompress() decompresses a DEFLATE stream from the buffer + * 'in' with compressed size up to 'in_nbytes' bytes. 
The uncompressed data is + * written to 'out', a buffer with size 'out_nbytes_avail' bytes. If + * decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. Otherwise, + * a nonzero result code such as LIBDEFLATE_BAD_DATA is returned, and the + * contents of the output buffer are undefined. + * + * Decompression stops at the end of the DEFLATE stream (as indicated by the + * BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes. + * + * libdeflate_deflate_decompress() can be used in cases where the actual + * uncompressed size is known (recommended) or unknown (not recommended): + * + * - If the actual uncompressed size is known, then pass the actual + * uncompressed size as 'out_nbytes_avail' and pass NULL for + * 'actual_out_nbytes_ret'. This makes libdeflate_deflate_decompress() fail + * with LIBDEFLATE_SHORT_OUTPUT if the data decompressed to fewer than the + * specified number of bytes. + * + * - If the actual uncompressed size is unknown, then provide a non-NULL + * 'actual_out_nbytes_ret' and provide a buffer with some size + * 'out_nbytes_avail' that you think is large enough to hold all the + * uncompressed data. In this case, if the data decompresses to less than + * or equal to 'out_nbytes_avail' bytes, then + * libdeflate_deflate_decompress() will write the actual uncompressed size + * to *actual_out_nbytes_ret and return 0 (LIBDEFLATE_SUCCESS). Otherwise, + * it will return LIBDEFLATE_INSUFFICIENT_SPACE if the provided buffer was + * not large enough but no other problems were encountered, or another + * nonzero result code if decompression failed for another reason. + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret' + * argument. If decompression succeeds and 'actual_in_nbytes_ret' is not NULL, + * then the actual compressed size of the DEFLATE stream (aligned to the next + * byte boundary) is written to *actual_in_nbytes_ret. + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_deflate_decompress(), but assumes the zlib wrapper format + * instead of raw DEFLATE. + * + * Decompression will stop at the end of the zlib stream, even if it is shorter + * than 'in_nbytes'. If you need to know exactly where the zlib stream ended, + * use libdeflate_zlib_decompress_ex(). + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_zlib_decompress(), but adds the 'actual_in_nbytes_ret' + * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression + * succeeds (indicating that the first zlib-compressed stream in the input + * buffer was decompressed), then the actual number of input bytes consumed is + * written to *actual_in_nbytes_ret. 
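A minimal hedged sketch of the "known uncompressed size" case described above; decompress_exact is a hypothetical helper, not part of this patch, and only functions declared in this header are used.

/*
 * Hypothetical sketch (not part of this patch): decompress a raw DEFLATE
 * stream whose exact uncompressed size is already known, per the
 * recommendation above.
 */
#include "libdeflate.h"

static int
decompress_exact(const void *in, size_t in_nbytes,
                 void *out, size_t out_nbytes /* exact uncompressed size */)
{
	struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
	enum libdeflate_result res;

	if (d == NULL)
		return -1;
	/* With a NULL actual_out_nbytes_ret, anything other than exactly
	 * out_nbytes of output is reported as an error
	 * (LIBDEFLATE_SHORT_OUTPUT or LIBDEFLATE_INSUFFICIENT_SPACE). */
	res = libdeflate_deflate_decompress(d, in, in_nbytes,
					    out, out_nbytes, NULL);
	libdeflate_free_decompressor(d);
	return (res == LIBDEFLATE_SUCCESS) ? 0 : -1;
}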
+ */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format + * instead of raw DEFLATE. + * + * If multiple gzip-compressed members are concatenated, then only the first + * will be decompressed. Use libdeflate_gzip_decompress_ex() if you need + * multi-member support. + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret' + * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression + * succeeds (indicating that the first gzip-compressed member in the input + * buffer was decompressed), then the actual number of input bytes consumed is + * written to *actual_in_nbytes_ret. + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret); + +/* + * libdeflate_free_decompressor() frees a decompressor that was allocated with + * libdeflate_alloc_decompressor(). If a NULL pointer is passed in, no action + * is taken. + */ +LIBDEFLATEAPI void +libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor); + +/* ========================================================================== */ +/* Checksums */ +/* ========================================================================== */ + +/* + * libdeflate_adler32() updates a running Adler-32 checksum with 'len' bytes of + * data and returns the updated checksum. When starting a new checksum, the + * required initial value for 'adler' is 1. This value is also returned when + * 'buffer' is specified as NULL. + */ +LIBDEFLATEAPI uint32_t +libdeflate_adler32(uint32_t adler, const void *buffer, size_t len); + + +/* + * libdeflate_crc32() updates a running CRC-32 checksum with 'len' bytes of data + * and returns the updated checksum. When starting a new checksum, the required + * initial value for 'crc' is 0. This value is also returned when 'buffer' is + * specified as NULL. + */ +LIBDEFLATEAPI uint32_t +libdeflate_crc32(uint32_t crc, const void *buffer, size_t len); + +/* ========================================================================== */ +/* Custom memory allocator */ +/* ========================================================================== */ + +/* + * Install a custom memory allocator which libdeflate will use for all memory + * allocations by default. 'malloc_func' is a function that must behave like + * malloc(), and 'free_func' is a function that must behave like free(). + * + * The per-(de)compressor custom memory allocator that can be specified in + * 'struct libdeflate_options' takes priority over this. + * + * This doesn't affect the free() function that will be used to free + * (de)compressors that were already in existence when this is called. + */ +LIBDEFLATEAPI void +libdeflate_set_memory_allocator(void *(*malloc_func)(size_t), + void (*free_func)(void *)); + +/* + * Advanced options. 
This is the options structure that + * libdeflate_alloc_compressor_ex() and libdeflate_alloc_decompressor_ex() + * require. Most users won't need this and should just use the non-"_ex" + * functions instead. If you do need this, it should be initialized like this: + * + * struct libdeflate_options options; + * + * memset(&options, 0, sizeof(options)); + * options.sizeof_options = sizeof(options); + * // Then set the fields that you need to override the defaults for. + */ +struct libdeflate_options { + + /* + * This field must be set to the struct size. This field exists for + * extensibility, so that fields can be appended to this struct in + * future versions of libdeflate while still supporting old binaries. + */ + size_t sizeof_options; + + /* + * An optional custom memory allocator to use for this (de)compressor. + * 'malloc_func' must be a function that behaves like malloc(), and + * 'free_func' must be a function that behaves like free(). + * + * This is useful in cases where a process might have multiple users of + * libdeflate who want to use different memory allocators. For example, + * a library might want to use libdeflate with a custom memory allocator + * without interfering with user code that might use libdeflate too. + * + * This takes priority over the "global" memory allocator (which by + * default is malloc() and free(), but can be changed by + * libdeflate_set_memory_allocator()). Moreover, libdeflate will never + * call the "global" memory allocator if a per-(de)compressor custom + * allocator is always given. + */ + void *(*malloc_func)(size_t); + void (*free_func)(void *); +}; + +#ifdef __cplusplus +} +#endif + +#endif /* LIBDEFLATE_H */ diff --git a/source/fbxplugin/plugin_main.cpp b/source/fbxplugin/plugin_main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..08d1cacab6f669d18793e3e7a41e01bbcf0f4283 --- /dev/null +++ b/source/fbxplugin/plugin_main.cpp @@ -0,0 +1,51 @@ +#include <xcommon/IXPlugin.h> +#include "ModelLoader.h" + +class CFBXPlugin: public IXUnknownImplementation<IXPlugin> +{ +public: + void XMETHODCALLTYPE startup(IXCore *pCore) override + { + m_pCore = pCore; + } + + void XMETHODCALLTYPE shutdown() override + { + } + + UINT XMETHODCALLTYPE getInterfaceCount() override + { + return(1); + } + const XGUID* XMETHODCALLTYPE getInterfaceGUID(UINT id) override + { + static XGUID s_guid; + switch(id) + { + case 0: + s_guid = IXMODELLOADER_GUID; + break; + + default: + return(NULL); + } + return(&s_guid); + } + void XMETHODCALLTYPE getInterface(UINT id, void **ppOut) override + { + switch(id) + { + case 0: + *ppOut = new CModelLoader(m_pCore->getFileSystem()); + break; + + default: + *ppOut = NULL; + } + } + +private: + IXCore *m_pCore = NULL; +}; + +DECLARE_XPLUGIN(CFBXPlugin); diff --git a/source/xcommon/IXModelLoader.h b/source/xcommon/IXModelLoader.h index f6e99f3a7fc81cd1e8341dfd7d6103fac1f9a133..f4bb15d67c9290efd11d93e33808714829bbd16b 100644 --- a/source/xcommon/IXModelLoader.h +++ b/source/xcommon/IXModelLoader.h @@ -33,7 +33,7 @@ struct XModelInfo //! количество костей UINT uBoneCount; - //! количетсов анимаций + //! количество анимаций UINT uAnimationCount; //! габариты в метрах
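To close the loop on the "Advanced options" block documented in libdeflate.h above, here is a hedged sketch of a per-decompressor custom allocator; my_alloc and my_free are hypothetical stand-ins and not part of this patch, and only the _ex API declared in the header is used.

/*
 * Hypothetical sketch (not part of this patch): allocate a decompressor
 * that uses its own allocator via libdeflate_alloc_decompressor_ex(),
 * following the initialization pattern shown in the header comment above.
 */
#include <stdlib.h>
#include <string.h>
#include "libdeflate.h"

static void *my_alloc(size_t n) { return malloc(n); } /* stand-in */
static void my_free(void *p) { free(p); }             /* stand-in */

static struct libdeflate_decompressor *
alloc_decompressor_with_custom_allocator(void)
{
	struct libdeflate_options options;

	memset(&options, 0, sizeof(options));
	options.sizeof_options = sizeof(options); /* required for versioning */
	options.malloc_func = my_alloc;
	options.free_func = my_free;
	return libdeflate_alloc_decompressor_ex(&options);
}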