Add zstd dict builder.

This commit is contained in:
Bartosz Taudul 2021-05-15 14:56:38 +02:00
parent 959ddc3501
commit 859b8e4193
No known key found for this signature in database
GPG Key ID: B7FE2008B7575DF3
17 changed files with 5884 additions and 0 deletions

View File

@ -123,6 +123,10 @@
<ClCompile Include="..\..\..\zstd\decompress\zstd_ddict.c" />
<ClCompile Include="..\..\..\zstd\decompress\zstd_decompress.c" />
<ClCompile Include="..\..\..\zstd\decompress\zstd_decompress_block.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\cover.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\divsufsort.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\fastcover.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\zdict.c" />
<ClCompile Include="..\..\src\capture.cpp" />
</ItemGroup>
<ItemGroup>
@ -182,6 +186,9 @@
<ClInclude Include="..\..\..\zstd\decompress\zstd_ddict.h" />
<ClInclude Include="..\..\..\zstd\decompress\zstd_decompress_block.h" />
<ClInclude Include="..\..\..\zstd\decompress\zstd_decompress_internal.h" />
<ClInclude Include="..\..\..\zstd\dictBuilder\cover.h" />
<ClInclude Include="..\..\..\zstd\dictBuilder\divsufsort.h" />
<ClInclude Include="..\..\..\zstd\zdict.h" />
<ClInclude Include="..\..\..\zstd\zstd.h" />
<ClInclude Include="..\..\..\zstd\zstd_errors.h" />
</ItemGroup>

View File

@ -25,6 +25,9 @@
<Filter Include="zstd\decompress">
<UniqueIdentifier>{c1f99170-d904-4af1-8010-0a3ded5736c8}</UniqueIdentifier>
</Filter>
<Filter Include="zstd\dictBuilder">
<UniqueIdentifier>{456e6786-ea57-42b8-ae38-829cd2d918bd}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\..\common\tracy_lz4.cpp">
@ -144,6 +147,18 @@
<ClCompile Include="..\..\..\zstd\decompress\zstd_decompress_block.c">
<Filter>zstd\decompress</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\cover.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\divsufsort.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\fastcover.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\zdict.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\..\common\tracy_lz4.hpp">
@ -320,5 +335,14 @@
<ClInclude Include="..\..\..\zstd\decompress\zstd_decompress_internal.h">
<Filter>zstd\decompress</Filter>
</ClInclude>
<ClInclude Include="..\..\..\zstd\dictBuilder\cover.h">
<Filter>zstd\dictBuilder</Filter>
</ClInclude>
<ClInclude Include="..\..\..\zstd\dictBuilder\divsufsort.h">
<Filter>zstd\dictBuilder</Filter>
</ClInclude>
<ClInclude Include="..\..\..\zstd\zdict.h">
<Filter>zstd</Filter>
</ClInclude>
</ItemGroup>
</Project>

View File

@ -122,6 +122,10 @@
<ClCompile Include="..\..\..\zstd\decompress\zstd_ddict.c" />
<ClCompile Include="..\..\..\zstd\decompress\zstd_decompress.c" />
<ClCompile Include="..\..\..\zstd\decompress\zstd_decompress_block.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\cover.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\divsufsort.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\fastcover.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\zdict.c" />
<ClCompile Include="..\..\src\csvexport.cpp" />
</ItemGroup>
<ItemGroup>
@ -180,6 +184,9 @@
<ClInclude Include="..\..\..\zstd\decompress\zstd_ddict.h" />
<ClInclude Include="..\..\..\zstd\decompress\zstd_decompress_block.h" />
<ClInclude Include="..\..\..\zstd\decompress\zstd_decompress_internal.h" />
<ClInclude Include="..\..\..\zstd\dictBuilder\cover.h" />
<ClInclude Include="..\..\..\zstd\dictBuilder\divsufsort.h" />
<ClInclude Include="..\..\..\zstd\zdict.h" />
<ClInclude Include="..\..\..\zstd\zstd.h" />
<ClInclude Include="..\..\..\zstd\zstd_errors.h" />
</ItemGroup>

View File

@ -25,6 +25,9 @@
<Filter Include="zstd\decompress">
<UniqueIdentifier>{d4181058-2198-4931-ae31-b7eda0312458}</UniqueIdentifier>
</Filter>
<Filter Include="zstd\dictBuilder">
<UniqueIdentifier>{873c22fe-b4d7-480d-ad67-48271296f4c1}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\..\common\tracy_lz4.cpp">
@ -141,6 +144,18 @@
<ClCompile Include="..\..\..\zstd\compress\zstdmt_compress.c">
<Filter>zstd\compress</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\cover.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\divsufsort.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\fastcover.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\zdict.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\..\common\tracy_lz4.hpp">
@ -314,5 +329,14 @@
<ClInclude Include="..\..\..\zstd\compress\zstdmt_compress.h">
<Filter>zstd\compress</Filter>
</ClInclude>
<ClInclude Include="..\..\..\zstd\zdict.h">
<Filter>zstd</Filter>
</ClInclude>
<ClInclude Include="..\..\..\zstd\dictBuilder\cover.h">
<Filter>zstd\dictBuilder</Filter>
</ClInclude>
<ClInclude Include="..\..\..\zstd\dictBuilder\divsufsort.h">
<Filter>zstd\dictBuilder</Filter>
</ClInclude>
</ItemGroup>
</Project>

View File

@ -120,6 +120,10 @@
<ClCompile Include="..\..\..\zstd\decompress\zstd_ddict.c" />
<ClCompile Include="..\..\..\zstd\decompress\zstd_decompress.c" />
<ClCompile Include="..\..\..\zstd\decompress\zstd_decompress_block.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\cover.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\divsufsort.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\fastcover.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\zdict.c" />
<ClCompile Include="..\..\src\import-chrome.cpp" />
</ItemGroup>
<ItemGroup>
@ -176,6 +180,9 @@
<ClInclude Include="..\..\..\zstd\decompress\zstd_ddict.h" />
<ClInclude Include="..\..\..\zstd\decompress\zstd_decompress_block.h" />
<ClInclude Include="..\..\..\zstd\decompress\zstd_decompress_internal.h" />
<ClInclude Include="..\..\..\zstd\dictBuilder\cover.h" />
<ClInclude Include="..\..\..\zstd\dictBuilder\divsufsort.h" />
<ClInclude Include="..\..\..\zstd\zdict.h" />
<ClInclude Include="..\..\..\zstd\zstd.h" />
<ClInclude Include="..\..\..\zstd\zstd_errors.h" />
</ItemGroup>

View File

@ -22,6 +22,9 @@
<Filter Include="zstd\decompress">
<UniqueIdentifier>{438fff23-197c-4b6f-91f0-74f8b3878571}</UniqueIdentifier>
</Filter>
<Filter Include="zstd\dictBuilder">
<UniqueIdentifier>{e5c7021a-e0e4-45c2-b461-e806bc036d5f}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\..\common\tracy_lz4.cpp">
@ -132,6 +135,18 @@
<ClCompile Include="..\..\..\zstd\decompress\zstd_decompress_block.c">
<Filter>zstd\decompress</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\cover.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\divsufsort.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\fastcover.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\zdict.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\..\common\tracy_lz4.hpp">
@ -299,5 +314,14 @@
<ClInclude Include="..\..\..\zstd\decompress\zstd_decompress_internal.h">
<Filter>zstd\decompress</Filter>
</ClInclude>
<ClInclude Include="..\..\..\zstd\zdict.h">
<Filter>zstd</Filter>
</ClInclude>
<ClInclude Include="..\..\..\zstd\dictBuilder\cover.h">
<Filter>zstd\dictBuilder</Filter>
</ClInclude>
<ClInclude Include="..\..\..\zstd\dictBuilder\divsufsort.h">
<Filter>zstd\dictBuilder</Filter>
</ClInclude>
</ItemGroup>
</Project>

View File

@ -160,6 +160,10 @@
<ClCompile Include="..\..\..\zstd\decompress\zstd_ddict.c" />
<ClCompile Include="..\..\..\zstd\decompress\zstd_decompress.c" />
<ClCompile Include="..\..\..\zstd\decompress\zstd_decompress_block.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\cover.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\divsufsort.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\fastcover.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\zdict.c" />
<ClCompile Include="..\..\libs\gl3w\GL\gl3w.c" />
<ClCompile Include="..\..\src\HttpRequest.cpp" />
<ClCompile Include="..\..\src\imgui_impl_glfw.cpp" />
@ -266,6 +270,9 @@
<ClInclude Include="..\..\..\zstd\decompress\zstd_ddict.h" />
<ClInclude Include="..\..\..\zstd\decompress\zstd_decompress_block.h" />
<ClInclude Include="..\..\..\zstd\decompress\zstd_decompress_internal.h" />
<ClInclude Include="..\..\..\zstd\dictBuilder\cover.h" />
<ClInclude Include="..\..\..\zstd\dictBuilder\divsufsort.h" />
<ClInclude Include="..\..\..\zstd\zdict.h" />
<ClInclude Include="..\..\..\zstd\zstd.h" />
<ClInclude Include="..\..\..\zstd\zstd_errors.h" />
<ClInclude Include="..\..\libs\gl3w\GL\gl3w.h" />

View File

@ -34,6 +34,9 @@
<Filter Include="zstd\decompress">
<UniqueIdentifier>{c93ef7c5-f1df-40a6-a4e3-81441e6df174}</UniqueIdentifier>
</Filter>
<Filter Include="zstd\dictBuilder">
<UniqueIdentifier>{200ab875-91e5-4c7a-8b98-ddbae19d2f98}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\..\common\tracy_lz4.cpp">
@ -234,6 +237,18 @@
<ClCompile Include="..\..\..\zstd\compress\zstdmt_compress.c">
<Filter>zstd\compress</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\cover.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\divsufsort.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\fastcover.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\zdict.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\..\common\tracy_lz4.hpp">
@ -554,6 +569,15 @@
<ClInclude Include="..\..\..\zstd\compress\zstdmt_compress.h">
<Filter>zstd\compress</Filter>
</ClInclude>
<ClInclude Include="..\..\..\zstd\zdict.h">
<Filter>zstd</Filter>
</ClInclude>
<ClInclude Include="..\..\..\zstd\dictBuilder\cover.h">
<Filter>zstd\dictBuilder</Filter>
</ClInclude>
<ClInclude Include="..\..\..\zstd\dictBuilder\divsufsort.h">
<Filter>zstd\dictBuilder</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Natvis Include="DebugVis.natvis" />

View File

@ -122,6 +122,10 @@
<ClCompile Include="..\..\..\zstd\decompress\zstd_ddict.c" />
<ClCompile Include="..\..\..\zstd\decompress\zstd_decompress.c" />
<ClCompile Include="..\..\..\zstd\decompress\zstd_decompress_block.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\cover.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\divsufsort.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\fastcover.c" />
<ClCompile Include="..\..\..\zstd\dictBuilder\zdict.c" />
<ClCompile Include="..\..\src\update.cpp" />
</ItemGroup>
<ItemGroup>
@ -180,6 +184,9 @@
<ClInclude Include="..\..\..\zstd\decompress\zstd_ddict.h" />
<ClInclude Include="..\..\..\zstd\decompress\zstd_decompress_block.h" />
<ClInclude Include="..\..\..\zstd\decompress\zstd_decompress_internal.h" />
<ClInclude Include="..\..\..\zstd\dictBuilder\cover.h" />
<ClInclude Include="..\..\..\zstd\dictBuilder\divsufsort.h" />
<ClInclude Include="..\..\..\zstd\zdict.h" />
<ClInclude Include="..\..\..\zstd\zstd.h" />
<ClInclude Include="..\..\..\zstd\zstd_errors.h" />
</ItemGroup>

View File

@ -25,6 +25,9 @@
<Filter Include="zstd\decompress">
<UniqueIdentifier>{aeb60a40-d098-408e-a8c6-3de1c75cd9b4}</UniqueIdentifier>
</Filter>
<Filter Include="zstd\dictBuilder">
<UniqueIdentifier>{375ceb06-6b2f-4a00-af80-64d17bcadaac}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\..\common\tracy_lz4.cpp">
@ -141,6 +144,18 @@
<ClCompile Include="..\..\..\zstd\common\zstd_common.c">
<Filter>zstd\common</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\cover.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\divsufsort.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\fastcover.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
<ClCompile Include="..\..\..\zstd\dictBuilder\zdict.c">
<Filter>zstd\dictBuilder</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\..\common\tracy_lz4.hpp">
@ -314,5 +329,14 @@
<ClInclude Include="..\..\..\zstd\common\zstd_trace.h">
<Filter>zstd\common</Filter>
</ClInclude>
<ClInclude Include="..\..\..\zstd\zdict.h">
<Filter>zstd</Filter>
</ClInclude>
<ClInclude Include="..\..\..\zstd\dictBuilder\cover.h">
<Filter>zstd\dictBuilder</Filter>
</ClInclude>
<ClInclude Include="..\..\..\zstd\dictBuilder\divsufsort.h">
<Filter>zstd\dictBuilder</Filter>
</ClInclude>
</ItemGroup>
</Project>

1246
zstd/dictBuilder/cover.c Normal file

File diff suppressed because it is too large Load Diff

158
zstd/dictBuilder/cover.h Normal file
View File

@ -0,0 +1,158 @@
/*
* Copyright (c) Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/
#ifndef ZDICT_STATIC_LINKING_ONLY
# define ZDICT_STATIC_LINKING_ONLY
#endif
#include <stdio.h> /* fprintf */
#include <stdlib.h> /* malloc, free, qsort */
#include <string.h> /* memset */
#include <time.h> /* clock */
#include "../common/mem.h" /* read */
#include "../common/pool.h"
#include "../common/threading.h"
#include "../common/zstd_internal.h" /* includes zstd.h */
#include "../zdict.h"
/**
* COVER_best_t is used for two purposes:
* 1. Synchronizing threads.
* 2. Saving the best parameters and dictionary.
*
* All of the methods except COVER_best_init() are thread safe if zstd is
* compiled with multithreaded support.
*/
typedef struct COVER_best_s {
ZSTD_pthread_mutex_t mutex;
ZSTD_pthread_cond_t cond;
size_t liveJobs;
void *dict;
size_t dictSize;
ZDICT_cover_params_t parameters;
size_t compressedSize;
} COVER_best_t;
/**
* A segment is a range in the source as well as the score of the segment.
*/
typedef struct {
U32 begin;
U32 end;
U32 score;
} COVER_segment_t;
/**
*Number of epochs and size of each epoch.
*/
typedef struct {
U32 num;
U32 size;
} COVER_epoch_info_t;
/**
* Struct used for the dictionary selection function.
*/
typedef struct COVER_dictSelection {
BYTE* dictContent;
size_t dictSize;
size_t totalCompressedSize;
} COVER_dictSelection_t;
/**
* Computes the number of epochs and the size of each epoch.
* We will make sure that each epoch gets at least 10 * k bytes.
*
* The COVER algorithms divide the data up into epochs of equal size and
* select one segment from each epoch.
*
* @param maxDictSize The maximum allowed dictionary size.
* @param nbDmers The number of dmers we are training on.
* @param k The parameter k (segment size).
* @param passes The target number of passes over the dmer corpus.
* More passes means a better dictionary.
*/
COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
U32 k, U32 passes);
/**
* Warns the user when their corpus is too small.
*/
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);
/**
* Checks total compressed size of a dictionary
*/
size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
const size_t *samplesSizes, const BYTE *samples,
size_t *offsets,
size_t nbTrainSamples, size_t nbSamples,
BYTE *const dict, size_t dictBufferCapacity);
/**
* Returns the sum of the sample sizes.
*/
size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;
/**
* Initialize the `COVER_best_t`.
*/
void COVER_best_init(COVER_best_t *best);
/**
* Wait until liveJobs == 0.
*/
void COVER_best_wait(COVER_best_t *best);
/**
* Call COVER_best_wait() and then destroy the COVER_best_t.
*/
void COVER_best_destroy(COVER_best_t *best);
/**
* Called when a thread is about to be launched.
* Increments liveJobs.
*/
void COVER_best_start(COVER_best_t *best);
/**
* Called when a thread finishes executing, both on error or success.
* Decrements liveJobs and signals any waiting threads if liveJobs == 0.
* If this dictionary is the best so far save it and its parameters.
*/
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
COVER_dictSelection_t selection);
/**
* Error function for COVER_selectDict function. Checks if the return
* value is an error.
*/
unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
/**
* Error function for COVER_selectDict function. Returns a struct where
* return.totalCompressedSize is a ZSTD error.
*/
COVER_dictSelection_t COVER_dictSelectionError(size_t error);
/**
* Always call after selectDict is called to free up used memory from
* newly created dictionary.
*/
void COVER_dictSelectionFree(COVER_dictSelection_t selection);
/**
* Called to finalize the dictionary and select one based on whether or not
* the shrink-dict flag was enabled. If enabled the dictionary used is the
* smallest dictionary within a specified regression of the compressed size
* from the largest dictionary.
*/
COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,67 @@
/*
* divsufsort.h for libdivsufsort-lite
* Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _DIVSUFSORT_H
#define _DIVSUFSORT_H 1
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/*- Prototypes -*/
/**
* Constructs the suffix array of a given string.
* @param T [0..n-1] The input string.
* @param SA [0..n-1] The output array of suffixes.
* @param n The length of the given string.
* @param openMP enables OpenMP optimization.
* @return 0 if no error occurred, -1 or -2 otherwise.
*/
int
divsufsort(const unsigned char *T, int *SA, int n, int openMP);
/**
* Constructs the burrows-wheeler transformed string of a given string.
* @param T [0..n-1] The input string.
* @param U [0..n-1] The output string. (can be T)
* @param A [0..n-1] The temporary array. (can be NULL)
* @param n The length of the given string.
* @param num_indexes The length of secondary indexes array. (can be NULL)
* @param indexes The secondary indexes array. (can be NULL)
* @param openMP enables OpenMP optimization.
* @return The primary index if no error occurred, -1 or -2 otherwise.
*/
int
divbwt(const unsigned char *T, unsigned char *U, int *A, int n, unsigned char * num_indexes, int * indexes, int openMP);
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
#endif /* _DIVSUFSORT_H */

View File

@ -0,0 +1,759 @@
/*
* Copyright (c) Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/
/*-*************************************
* Dependencies
***************************************/
#include <stdio.h> /* fprintf */
#include <stdlib.h> /* malloc, free, qsort */
#include <string.h> /* memset */
#include <time.h> /* clock */
#ifndef ZDICT_STATIC_LINKING_ONLY
# define ZDICT_STATIC_LINKING_ONLY
#endif
#include "../common/mem.h" /* read */
#include "../common/pool.h"
#include "../common/threading.h"
#include "../common/zstd_internal.h" /* includes zstd.h */
#include "../compress/zstd_compress_internal.h" /* ZSTD_hash*() */
#include "../zdict.h"
#include "cover.h"
/*-*************************************
* Constants
***************************************/
#define FASTCOVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
#define FASTCOVER_MAX_F 31
#define FASTCOVER_MAX_ACCEL 10
#define FASTCOVER_DEFAULT_SPLITPOINT 0.75
#define DEFAULT_F 20
#define DEFAULT_ACCEL 1
/*-*************************************
* Console display
***************************************/
#ifndef LOCALDISPLAYLEVEL
static int g_displayLevel = 2;
#endif
#undef DISPLAY
#define DISPLAY(...) \
{ \
fprintf(stderr, __VA_ARGS__); \
fflush(stderr); \
}
#undef LOCALDISPLAYLEVEL
#define LOCALDISPLAYLEVEL(displayLevel, l, ...) \
if (displayLevel >= l) { \
DISPLAY(__VA_ARGS__); \
} /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
#undef DISPLAYLEVEL
#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
#ifndef LOCALDISPLAYUPDATE
static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100;
static clock_t g_time = 0;
#endif
#undef LOCALDISPLAYUPDATE
#define LOCALDISPLAYUPDATE(displayLevel, l, ...) \
if (displayLevel >= l) { \
if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) { \
g_time = clock(); \
DISPLAY(__VA_ARGS__); \
} \
}
#undef DISPLAYUPDATE
#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
/*-*************************************
* Hash Functions
***************************************/
/**
* Hash the d-byte value pointed to by p and mod 2^f into the frequency vector
*/
static size_t FASTCOVER_hashPtrToIndex(const void* p, U32 f, unsigned d) {
if (d == 6) {
return ZSTD_hash6Ptr(p, f);
}
return ZSTD_hash8Ptr(p, f);
}
/*-*************************************
* Acceleration
***************************************/
typedef struct {
unsigned finalize; /* Percentage of training samples used for ZDICT_finalizeDictionary */
unsigned skip; /* Number of dmer skipped between each dmer counted in computeFrequency */
} FASTCOVER_accel_t;
static const FASTCOVER_accel_t FASTCOVER_defaultAccelParameters[FASTCOVER_MAX_ACCEL+1] = {
{ 100, 0 }, /* accel = 0, should not happen because accel = 0 defaults to accel = 1 */
{ 100, 0 }, /* accel = 1 */
{ 50, 1 }, /* accel = 2 */
{ 34, 2 }, /* accel = 3 */
{ 25, 3 }, /* accel = 4 */
{ 20, 4 }, /* accel = 5 */
{ 17, 5 }, /* accel = 6 */
{ 14, 6 }, /* accel = 7 */
{ 13, 7 }, /* accel = 8 */
{ 11, 8 }, /* accel = 9 */
{ 10, 9 }, /* accel = 10 */
};
/*-*************************************
* Context
***************************************/
typedef struct {
const BYTE *samples;
size_t *offsets;
const size_t *samplesSizes;
size_t nbSamples;
size_t nbTrainSamples;
size_t nbTestSamples;
size_t nbDmers;
U32 *freqs;
unsigned d;
unsigned f;
FASTCOVER_accel_t accelParams;
} FASTCOVER_ctx_t;
/*-*************************************
* Helper functions
***************************************/
/**
* Selects the best segment in an epoch.
* Segments of are scored according to the function:
*
* Let F(d) be the frequency of all dmers with hash value d.
* Let S_i be hash value of the dmer at position i of segment S which has length k.
*
* Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
*
* Once the dmer with hash value d is in the dictionary we set F(d) = 0.
*/
static COVER_segment_t FASTCOVER_selectSegment(const FASTCOVER_ctx_t *ctx,
U32 *freqs, U32 begin, U32 end,
ZDICT_cover_params_t parameters,
U16* segmentFreqs) {
/* Constants */
const U32 k = parameters.k;
const U32 d = parameters.d;
const U32 f = ctx->f;
const U32 dmersInK = k - d + 1;
/* Try each segment (activeSegment) and save the best (bestSegment) */
COVER_segment_t bestSegment = {0, 0, 0};
COVER_segment_t activeSegment;
/* Reset the activeDmers in the segment */
/* The activeSegment starts at the beginning of the epoch. */
activeSegment.begin = begin;
activeSegment.end = begin;
activeSegment.score = 0;
/* Slide the activeSegment through the whole epoch.
* Save the best segment in bestSegment.
*/
while (activeSegment.end < end) {
/* Get hash value of current dmer */
const size_t idx = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.end, f, d);
/* Add frequency of this index to score if this is the first occurrence of index in active segment */
if (segmentFreqs[idx] == 0) {
activeSegment.score += freqs[idx];
}
/* Increment end of segment and segmentFreqs*/
activeSegment.end += 1;
segmentFreqs[idx] += 1;
/* If the window is now too large, drop the first position */
if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
/* Get hash value of the dmer to be eliminated from active segment */
const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
segmentFreqs[delIndex] -= 1;
/* Subtract frequency of this index from score if this is the last occurrence of this index in active segment */
if (segmentFreqs[delIndex] == 0) {
activeSegment.score -= freqs[delIndex];
}
/* Increment start of segment */
activeSegment.begin += 1;
}
/* If this segment is the best so far save it */
if (activeSegment.score > bestSegment.score) {
bestSegment = activeSegment;
}
}
/* Zero out rest of segmentFreqs array */
while (activeSegment.begin < end) {
const size_t delIndex = FASTCOVER_hashPtrToIndex(ctx->samples + activeSegment.begin, f, d);
segmentFreqs[delIndex] -= 1;
activeSegment.begin += 1;
}
{
/* Zero the frequency of hash value of each dmer covered by the chosen segment. */
U32 pos;
for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
const size_t i = FASTCOVER_hashPtrToIndex(ctx->samples + pos, f, d);
freqs[i] = 0;
}
}
return bestSegment;
}
static int FASTCOVER_checkParameters(ZDICT_cover_params_t parameters,
size_t maxDictSize, unsigned f,
unsigned accel) {
/* k, d, and f are required parameters */
if (parameters.d == 0 || parameters.k == 0) {
return 0;
}
/* d has to be 6 or 8 */
if (parameters.d != 6 && parameters.d != 8) {
return 0;
}
/* k <= maxDictSize */
if (parameters.k > maxDictSize) {
return 0;
}
/* d <= k */
if (parameters.d > parameters.k) {
return 0;
}
/* 0 < f <= FASTCOVER_MAX_F*/
if (f > FASTCOVER_MAX_F || f == 0) {
return 0;
}
/* 0 < splitPoint <= 1 */
if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) {
return 0;
}
/* 0 < accel <= 10 */
if (accel > 10 || accel == 0) {
return 0;
}
return 1;
}
/**
* Clean up a context initialized with `FASTCOVER_ctx_init()`.
*/
static void
FASTCOVER_ctx_destroy(FASTCOVER_ctx_t* ctx)
{
if (!ctx) return;
free(ctx->freqs);
ctx->freqs = NULL;
free(ctx->offsets);
ctx->offsets = NULL;
}
/**
* Calculate for frequency of hash value of each dmer in ctx->samples
*/
static void
FASTCOVER_computeFrequency(U32* freqs, const FASTCOVER_ctx_t* ctx)
{
const unsigned f = ctx->f;
const unsigned d = ctx->d;
const unsigned skip = ctx->accelParams.skip;
const unsigned readLength = MAX(d, 8);
size_t i;
assert(ctx->nbTrainSamples >= 5);
assert(ctx->nbTrainSamples <= ctx->nbSamples);
for (i = 0; i < ctx->nbTrainSamples; i++) {
size_t start = ctx->offsets[i]; /* start of current dmer */
size_t const currSampleEnd = ctx->offsets[i+1];
while (start + readLength <= currSampleEnd) {
const size_t dmerIndex = FASTCOVER_hashPtrToIndex(ctx->samples + start, f, d);
freqs[dmerIndex]++;
start = start + skip + 1;
}
}
}
/**
* Prepare a context for dictionary building.
* The context is only dependent on the parameter `d` and can used multiple
* times.
* Returns 0 on success or error code on error.
* The context must be destroyed with `FASTCOVER_ctx_destroy()`.
*/
static size_t
FASTCOVER_ctx_init(FASTCOVER_ctx_t* ctx,
const void* samplesBuffer,
const size_t* samplesSizes, unsigned nbSamples,
unsigned d, double splitPoint, unsigned f,
FASTCOVER_accel_t accelParams)
{
const BYTE* const samples = (const BYTE*)samplesBuffer;
const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
/* Split samples into testing and training sets */
const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
/* Checks */
if (totalSamplesSize < MAX(d, sizeof(U64)) ||
totalSamplesSize >= (size_t)FASTCOVER_MAX_SAMPLES_SIZE) {
DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
(unsigned)(totalSamplesSize >> 20), (FASTCOVER_MAX_SAMPLES_SIZE >> 20));
return ERROR(srcSize_wrong);
}
/* Check if there are at least 5 training samples */
if (nbTrainSamples < 5) {
DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid\n", nbTrainSamples);
return ERROR(srcSize_wrong);
}
/* Check if there's testing sample */
if (nbTestSamples < 1) {
DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.\n", nbTestSamples);
return ERROR(srcSize_wrong);
}
/* Zero the context */
memset(ctx, 0, sizeof(*ctx));
DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
(unsigned)trainingSamplesSize);
DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
(unsigned)testSamplesSize);
ctx->samples = samples;
ctx->samplesSizes = samplesSizes;
ctx->nbSamples = nbSamples;
ctx->nbTrainSamples = nbTrainSamples;
ctx->nbTestSamples = nbTestSamples;
ctx->nbDmers = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
ctx->d = d;
ctx->f = f;
ctx->accelParams = accelParams;
/* The offsets of each file */
ctx->offsets = (size_t*)calloc((nbSamples + 1), sizeof(size_t));
if (ctx->offsets == NULL) {
DISPLAYLEVEL(1, "Failed to allocate scratch buffers \n");
FASTCOVER_ctx_destroy(ctx);
return ERROR(memory_allocation);
}
/* Fill offsets from the samplesSizes */
{ U32 i;
ctx->offsets[0] = 0;
assert(nbSamples >= 5);
for (i = 1; i <= nbSamples; ++i) {
ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
}
}
/* Initialize frequency array of size 2^f */
ctx->freqs = (U32*)calloc(((U64)1 << f), sizeof(U32));
if (ctx->freqs == NULL) {
DISPLAYLEVEL(1, "Failed to allocate frequency table \n");
FASTCOVER_ctx_destroy(ctx);
return ERROR(memory_allocation);
}
DISPLAYLEVEL(2, "Computing frequencies\n");
FASTCOVER_computeFrequency(ctx->freqs, ctx);
return 0;
}
/**
* Given the prepared context build the dictionary.
*/
static size_t
FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
U32* freqs,
void* dictBuffer, size_t dictBufferCapacity,
ZDICT_cover_params_t parameters,
U16* segmentFreqs)
{
BYTE *const dict = (BYTE *)dictBuffer;
size_t tail = dictBufferCapacity;
/* Divide the data into epochs. We will select one segment from each epoch. */
const COVER_epoch_info_t epochs = COVER_computeEpochs(
(U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1);
const size_t maxZeroScoreRun = 10;
size_t zeroScoreRun = 0;
size_t epoch;
DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
(U32)epochs.num, (U32)epochs.size);
/* Loop through the epochs until there are no more segments or the dictionary
* is full.
*/
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
const U32 epochBegin = (U32)(epoch * epochs.size);
const U32 epochEnd = epochBegin + epochs.size;
size_t segmentSize;
/* Select a segment */
COVER_segment_t segment = FASTCOVER_selectSegment(
ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
/* If the segment covers no dmers, then we are out of content.
* There may be new content in other epochs, for continue for some time.
*/
if (segment.score == 0) {
if (++zeroScoreRun >= maxZeroScoreRun) {
break;
}
continue;
}
zeroScoreRun = 0;
/* Trim the segment if necessary and if it is too small then we are done */
segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
if (segmentSize < parameters.d) {
break;
}
/* We fill the dictionary from the back to allow the best segments to be
* referenced with the smallest offsets.
*/
tail -= segmentSize;
memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
DISPLAYUPDATE(
2, "\r%u%% ",
(unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
}
DISPLAYLEVEL(2, "\r%79s\r", "");
return tail;
}
/**
* Parameters for FASTCOVER_tryParameters().
*/
typedef struct FASTCOVER_tryParameters_data_s {
const FASTCOVER_ctx_t* ctx;
COVER_best_t* best;
size_t dictBufferCapacity;
ZDICT_cover_params_t parameters;
} FASTCOVER_tryParameters_data_t;
/**
* Tries a set of parameters and updates the COVER_best_t with the results.
* This function is thread safe if zstd is compiled with multithreaded support.
* It takes its parameters as an *OWNING* opaque pointer to support threading.
*/
static void FASTCOVER_tryParameters(void* opaque)
{
/* Save parameters as local variables */
FASTCOVER_tryParameters_data_t *const data = (FASTCOVER_tryParameters_data_t*)opaque;
const FASTCOVER_ctx_t *const ctx = data->ctx;
const ZDICT_cover_params_t parameters = data->parameters;
size_t dictBufferCapacity = data->dictBufferCapacity;
size_t totalCompressedSize = ERROR(GENERIC);
/* Initialize array to keep track of frequency of dmer within activeSegment */
U16* segmentFreqs = (U16*)calloc(((U64)1 << ctx->f), sizeof(U16));
/* Allocate space for hash table, dict, and freqs */
BYTE *const dict = (BYTE*)malloc(dictBufferCapacity);
COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
U32* freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
if (!segmentFreqs || !dict || !freqs) {
DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
goto _cleanup;
}
/* Copy the frequencies because we need to modify them */
memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
/* Build the dictionary */
{ const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
parameters, segmentFreqs);
const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail,
ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
totalCompressedSize);
if (COVER_dictSelectionIsError(selection)) {
DISPLAYLEVEL(1, "Failed to select dictionary\n");
goto _cleanup;
}
}
_cleanup:
free(dict);
COVER_best_finish(data->best, parameters, selection);
free(data);
free(segmentFreqs);
COVER_dictSelectionFree(selection);
free(freqs);
}
static void
FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
ZDICT_cover_params_t* coverParams)
{
coverParams->k = fastCoverParams.k;
coverParams->d = fastCoverParams.d;
coverParams->steps = fastCoverParams.steps;
coverParams->nbThreads = fastCoverParams.nbThreads;
coverParams->splitPoint = fastCoverParams.splitPoint;
coverParams->zParams = fastCoverParams.zParams;
coverParams->shrinkDict = fastCoverParams.shrinkDict;
}
static void
FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
ZDICT_fastCover_params_t* fastCoverParams,
unsigned f, unsigned accel)
{
fastCoverParams->k = coverParams.k;
fastCoverParams->d = coverParams.d;
fastCoverParams->steps = coverParams.steps;
fastCoverParams->nbThreads = coverParams.nbThreads;
fastCoverParams->splitPoint = coverParams.splitPoint;
fastCoverParams->f = f;
fastCoverParams->accel = accel;
fastCoverParams->zParams = coverParams.zParams;
fastCoverParams->shrinkDict = coverParams.shrinkDict;
}
ZDICTLIB_API size_t
ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer,
const size_t* samplesSizes, unsigned nbSamples,
ZDICT_fastCover_params_t parameters)
{
BYTE* const dict = (BYTE*)dictBuffer;
FASTCOVER_ctx_t ctx;
ZDICT_cover_params_t coverParams;
FASTCOVER_accel_t accelParams;
/* Initialize global data */
g_displayLevel = parameters.zParams.notificationLevel;
/* Assign splitPoint and f if not provided */
parameters.splitPoint = 1.0;
parameters.f = parameters.f == 0 ? DEFAULT_F : parameters.f;
parameters.accel = parameters.accel == 0 ? DEFAULT_ACCEL : parameters.accel;
/* Convert to cover parameter */
memset(&coverParams, 0 , sizeof(coverParams));
FASTCOVER_convertToCoverParams(parameters, &coverParams);
/* Checks */
if (!FASTCOVER_checkParameters(coverParams, dictBufferCapacity, parameters.f,
parameters.accel)) {
DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
return ERROR(parameter_outOfBound);
}
if (nbSamples == 0) {
DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
return ERROR(srcSize_wrong);
}
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
ZDICT_DICTSIZE_MIN);
return ERROR(dstSize_tooSmall);
}
/* Assign corresponding FASTCOVER_accel_t to accelParams*/
accelParams = FASTCOVER_defaultAccelParameters[parameters.accel];
/* Initialize context */
{
size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
coverParams.d, parameters.splitPoint, parameters.f,
accelParams);
if (ZSTD_isError(initVal)) {
DISPLAYLEVEL(1, "Failed to initialize context\n");
return initVal;
}
}
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, g_displayLevel);
/* Build the dictionary */
DISPLAYLEVEL(2, "Building dictionary\n");
{
/* Initialize array to keep track of frequency of dmer within activeSegment */
U16* segmentFreqs = (U16 *)calloc(((U64)1 << parameters.f), sizeof(U16));
const size_t tail = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer,
dictBufferCapacity, coverParams, segmentFreqs);
const unsigned nbFinalizeSamples = (unsigned)(ctx.nbTrainSamples * ctx.accelParams.finalize / 100);
const size_t dictionarySize = ZDICT_finalizeDictionary(
dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
samplesBuffer, samplesSizes, nbFinalizeSamples, coverParams.zParams);
if (!ZSTD_isError(dictionarySize)) {
DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
(unsigned)dictionarySize);
}
FASTCOVER_ctx_destroy(&ctx);
free(segmentFreqs);
return dictionarySize;
}
}
ZDICTLIB_API size_t
ZDICT_optimizeTrainFromBuffer_fastCover(
void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer,
const size_t* samplesSizes, unsigned nbSamples,
ZDICT_fastCover_params_t* parameters)
{
ZDICT_cover_params_t coverParams;
FASTCOVER_accel_t accelParams;
/* constants */
const unsigned nbThreads = parameters->nbThreads;
const double splitPoint =
parameters->splitPoint <= 0.0 ? FASTCOVER_DEFAULT_SPLITPOINT : parameters->splitPoint;
const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
const unsigned kIterations =
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
const unsigned shrinkDict = 0;
/* Local variables */
const int displayLevel = parameters->zParams.notificationLevel;
unsigned iteration = 1;
unsigned d;
unsigned k;
COVER_best_t best;
POOL_ctx *pool = NULL;
int warned = 0;
/* Checks */
if (splitPoint <= 0 || splitPoint > 1) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
return ERROR(parameter_outOfBound);
}
if (accel == 0 || accel > FASTCOVER_MAX_ACCEL) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect accel\n");
return ERROR(parameter_outOfBound);
}
if (kMinK < kMaxD || kMaxK < kMinK) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
return ERROR(parameter_outOfBound);
}
if (nbSamples == 0) {
LOCALDISPLAYLEVEL(displayLevel, 1, "FASTCOVER must have at least one input file\n");
return ERROR(srcSize_wrong);
}
if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
LOCALDISPLAYLEVEL(displayLevel, 1, "dictBufferCapacity must be at least %u\n",
ZDICT_DICTSIZE_MIN);
return ERROR(dstSize_tooSmall);
}
if (nbThreads > 1) {
pool = POOL_create(nbThreads, 1);
if (!pool) {
return ERROR(memory_allocation);
}
}
/* Initialization */
COVER_best_init(&best);
memset(&coverParams, 0 , sizeof(coverParams));
FASTCOVER_convertToCoverParams(*parameters, &coverParams);
accelParams = FASTCOVER_defaultAccelParameters[accel];
/* Turn down global display level to clean up display at level 2 and below */
g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
/* Loop through d first because each new value needs a new context */
LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
kIterations);
for (d = kMinD; d <= kMaxD; d += 2) {
/* Initialize the context for this value of d */
FASTCOVER_ctx_t ctx;
LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
{
size_t const initVal = FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint, f, accelParams);
if (ZSTD_isError(initVal)) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
COVER_best_destroy(&best);
POOL_free(pool);
return initVal;
}
}
if (!warned) {
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers, displayLevel);
warned = 1;
}
/* Loop through k reusing the same context */
for (k = kMinK; k <= kMaxK; k += kStepSize) {
/* Prepare the arguments */
FASTCOVER_tryParameters_data_t *data = (FASTCOVER_tryParameters_data_t *)malloc(
sizeof(FASTCOVER_tryParameters_data_t));
LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
if (!data) {
LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
COVER_best_destroy(&best);
FASTCOVER_ctx_destroy(&ctx);
POOL_free(pool);
return ERROR(memory_allocation);
}
data->ctx = &ctx;
data->best = &best;
data->dictBufferCapacity = dictBufferCapacity;
data->parameters = coverParams;
data->parameters.k = k;
data->parameters.d = d;
data->parameters.splitPoint = splitPoint;
data->parameters.steps = kSteps;
data->parameters.shrinkDict = shrinkDict;
data->parameters.zParams.notificationLevel = g_displayLevel;
/* Check the parameters */
if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
data->ctx->f, accel)) {
DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
free(data);
continue;
}
/* Call the function and pass ownership of data to it */
COVER_best_start(&best);
if (pool) {
POOL_add(pool, &FASTCOVER_tryParameters, data);
} else {
FASTCOVER_tryParameters(data);
}
/* Print status */
LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",
(unsigned)((iteration * 100) / kIterations));
++iteration;
}
COVER_best_wait(&best);
FASTCOVER_ctx_destroy(&ctx);
}
LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
/* Fill the output buffer and parameters with output of the best parameters */
{
const size_t dictSize = best.dictSize;
if (ZSTD_isError(best.compressedSize)) {
const size_t compressedSize = best.compressedSize;
COVER_best_destroy(&best);
POOL_free(pool);
return compressedSize;
}
FASTCOVER_convertToFastCoverParams(best.parameters, parameters, f, accel);
memcpy(dictBuffer, best.dict, dictSize);
COVER_best_destroy(&best);
POOL_free(pool);
return dictSize;
}
}

1134
zstd/dictBuilder/zdict.c Normal file

File diff suppressed because it is too large Load Diff

452
zstd/zdict.h Normal file
View File

@ -0,0 +1,452 @@
/*
* Copyright (c) Yann Collet, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/
#ifndef DICTBUILDER_H_001
#define DICTBUILDER_H_001
#if defined (__cplusplus)
extern "C" {
#endif
/*====== Dependencies ======*/
#include <stddef.h> /* size_t */
/* ===== ZDICTLIB_API : control library symbols visibility ===== */
#ifndef ZDICTLIB_VISIBILITY
# if defined(__GNUC__) && (__GNUC__ >= 4)
# define ZDICTLIB_VISIBILITY __attribute__ ((visibility ("default")))
# else
# define ZDICTLIB_VISIBILITY
# endif
#endif
#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
# define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBILITY
#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
# define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
#else
# define ZDICTLIB_API ZDICTLIB_VISIBILITY
#endif
/*******************************************************************************
* Zstd dictionary builder
*
* FAQ
* ===
* Why should I use a dictionary?
* ------------------------------
*
* Zstd can use dictionaries to improve compression ratio of small data.
* Traditionally small files don't compress well because there is very little
* repetion in a single sample, since it is small. But, if you are compressing
* many similar files, like a bunch of JSON records that share the same
* structure, you can train a dictionary on ahead of time on some samples of
* these files. Then, zstd can use the dictionary to find repetitions that are
* present across samples. This can vastly improve compression ratio.
*
* When is a dictionary useful?
* ----------------------------
*
* Dictionaries are useful when compressing many small files that are similar.
* The larger a file is, the less benefit a dictionary will have. Generally,
* we don't expect dictionary compression to be effective past 100KB. And the
* smaller a file is, the more we would expect the dictionary to help.
*
* How do I use a dictionary?
* --------------------------
*
* Simply pass the dictionary to the zstd compressor with
* `ZSTD_CCtx_loadDictionary()`. The same dictionary must then be passed to
* the decompressor, using `ZSTD_DCtx_loadDictionary()`. There are other
* more advanced functions that allow selecting some options, see zstd.h for
* complete documentation.
*
* What is a zstd dictionary?
* --------------------------
*
* A zstd dictionary has two pieces: Its header, and its content. The header
* contains a magic number, the dictionary ID, and entropy tables. These
* entropy tables allow zstd to save on header costs in the compressed file,
* which really matters for small data. The content is just bytes, which are
* repeated content that is common across many samples.
*
* What is a raw content dictionary?
* ---------------------------------
*
* A raw content dictionary is just bytes. It doesn't have a zstd dictionary
* header, a dictionary ID, or entropy tables. Any buffer is a valid raw
* content dictionary.
*
* How do I train a dictionary?
* ----------------------------
*
* Gather samples from your use case. These samples should be similar to each
* other. If you have several use cases, you could try to train one dictionary
* per use case.
*
* Pass those samples to `ZDICT_trainFromBuffer()` and that will train your
* dictionary. There are a few advanced versions of this function, but this
* is a great starting point. If you want to further tune your dictionary
* you could try `ZDICT_optimizeTrainFromBuffer_cover()`. If that is too slow
* you can try `ZDICT_optimizeTrainFromBuffer_fastCover()`.
*
* If the dictionary training function fails, that is likely because you
* either passed too few samples, or a dictionary would not be effective
* for your data. Look at the messages that the dictionary trainer printed,
* if it doesn't say too few samples, then a dictionary would not be effective.
*
* How large should my dictionary be?
* ----------------------------------
*
* A reasonable dictionary size, the `dictBufferCapacity`, is about 100KB.
* The zstd CLI defaults to a 110KB dictionary. You likely don't need a
* dictionary larger than that. But, most use cases can get away with a
* smaller dictionary. The advanced dictionary builders can automatically
* shrink the dictionary for you, and select a the smallest size that
* doesn't hurt compression ratio too much. See the `shrinkDict` parameter.
* A smaller dictionary can save memory, and potentially speed up
* compression.
*
* How many samples should I provide to the dictionary builder?
* ------------------------------------------------------------
*
* We generally recommend passing ~100x the size of the dictionary
* in samples. A few thousand should suffice. Having too few samples
* can hurt the dictionaries effectiveness. Having more samples will
* only improve the dictionaries effectiveness. But having too many
* samples can slow down the dictionary builder.
*
* How do I determine if a dictionary will be effective?
* -----------------------------------------------------
*
* Simply train a dictionary and try it out. You can use zstd's built in
* benchmarking tool to test the dictionary effectiveness.
*
* # Benchmark levels 1-3 without a dictionary
* zstd -b1e3 -r /path/to/my/files
* # Benchmark levels 1-3 with a dictioanry
* zstd -b1e3 -r /path/to/my/files -D /path/to/my/dictionary
*
* When should I retrain a dictionary?
* -----------------------------------
*
* You should retrain a dictionary when its effectiveness drops. Dictionary
* effectiveness drops as the data you are compressing changes. Generally, we do
* expect dictionaries to "decay" over time, as your data changes, but the rate
* at which they decay depends on your use case. Internally, we regularly
* retrain dictionaries, and if the new dictionary performs significantly
* better than the old dictionary, we will ship the new dictionary.
*
* I have a raw content dictionary, how do I turn it into a zstd dictionary?
* -------------------------------------------------------------------------
*
* If you have a raw content dictionary, e.g. by manually constructing it, or
* using a third-party dictionary builder, you can turn it into a zstd
* dictionary by using `ZDICT_finalizeDictionary()`. You'll also have to
* provide some samples of the data. It will add the zstd header to the
* raw content, which contains a dictionary ID and entropy tables, which
* will improve compression ratio, and allow zstd to write the dictionary ID
* into the frame, if you so choose.
*
* Do I have to use zstd's dictionary builder?
* -------------------------------------------
*
* No! You can construct dictionary content however you please, it is just
* bytes. It will always be valid as a raw content dictionary. If you want
* a zstd dictionary, which can improve compression ratio, use
* `ZDICT_finalizeDictionary()`.
*
* What is the attack surface of a zstd dictionary?
* ------------------------------------------------
*
* Zstd is heavily fuzz tested, including loading fuzzed dictionaries, so
* zstd should never crash, or access out-of-bounds memory no matter what
* the dictionary is. However, if an attacker can control the dictionary
* during decompression, they can cause zstd to generate arbitrary bytes,
* just like if they controlled the compressed data.
*
******************************************************************************/
/*! ZDICT_trainFromBuffer():
* Train a dictionary from an array of samples.
* Redirect towards ZDICT_optimizeTrainFromBuffer_fastCover() single-threaded, with d=8, steps=4,
* f=20, and accel=1.
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
* supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
* The resulting dictionary will be saved into `dictBuffer`.
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
* or an error code, which can be tested with ZDICT_isError().
* Note: Dictionary training will fail if there are not enough samples to construct a
* dictionary, or if most of the samples are too small (< 8 bytes being the lower limit).
* If dictionary training fails, you should use zstd without a dictionary, as the dictionary
* would've been ineffective anyways. If you believe your samples would benefit from a dictionary
* please open an issue with details, and we can look into it.
* Note: ZDICT_trainFromBuffer()'s memory usage is about 6 MB.
* Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
* It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
* In general, it's recommended to provide a few thousands samples, though this can vary a lot.
* It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
*/
ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer,
const size_t* samplesSizes, unsigned nbSamples);
typedef struct {
int compressionLevel; /*< optimize for a specific zstd compression level; 0 means default */
unsigned notificationLevel; /*< Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
unsigned dictID; /*< force dictID value; 0 means auto mode (32-bits random value)
* NOTE: The zstd format reserves some dictionary IDs for future use.
* You may use them in private settings, but be warned that they
* may be used by zstd in a public dictionary registry in the future.
* These dictionary IDs are:
* - low range : <= 32767
* - high range : >= (2^31)
*/
} ZDICT_params_t;
/*! ZDICT_finalizeDictionary():
* Given a custom content as a basis for dictionary, and a set of samples,
* finalize dictionary by adding headers and statistics according to the zstd
* dictionary format.
*
* Samples must be stored concatenated in a flat buffer `samplesBuffer`,
* supplied with an array of sizes `samplesSizes`, providing the size of each
* sample in order. The samples are used to construct the statistics, so they
* should be representative of what you will compress with this dictionary.
*
* The compression level can be set in `parameters`. You should pass the
* compression level you expect to use in production. The statistics for each
* compression level differ, so tuning the dictionary for the compression level
* can help quite a bit.
*
* You can set an explicit dictionary ID in `parameters`, or allow us to pick
* a random dictionary ID for you, but we can't guarantee no collisions.
*
* The dstDictBuffer and the dictContent may overlap, and the content will be
* appended to the end of the header. If the header + the content doesn't fit in
* maxDictSize the beginning of the content is truncated to make room, since it
* is presumed that the most profitable content is at the end of the dictionary,
* since that is the cheapest to reference.
*
* `dictContentSize` must be >= ZDICT_CONTENTSIZE_MIN bytes.
* `maxDictSize` must be >= max(dictContentSize, ZSTD_DICTSIZE_MIN).
*
* @return: size of dictionary stored into `dstDictBuffer` (<= `maxDictSize`),
* or an error code, which can be tested by ZDICT_isError().
* Note: ZDICT_finalizeDictionary() will push notifications into stderr if
* instructed to, using notificationLevel>0.
* NOTE: This function currently may fail in several edge cases including:
* * Not enough samples
* * Samples are uncompressible
* * Samples are all exactly the same
*/
ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dstDictBuffer, size_t maxDictSize,
const void* dictContent, size_t dictContentSize,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_params_t parameters);
/*====== Helper functions ======*/
ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize); /**< extracts dictID; @return zero if error (not a valid dictionary) */
ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize); /* returns dict header size; returns a ZSTD error code on failure */
ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode);
ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
#ifdef ZDICT_STATIC_LINKING_ONLY
/* ====================================================================================
* The definitions in this section are considered experimental.
* They should never be used with a dynamic library, as they may change in the future.
* They are provided for advanced usages.
* Use them only in association with static linking.
* ==================================================================================== */
#define ZDICT_CONTENTSIZE_MIN 128
#define ZDICT_DICTSIZE_MIN 256
/*! ZDICT_cover_params_t:
* k and d are the only required parameters.
* For others, value 0 means default.
*/
typedef struct {
unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */
unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worse shrinkDictMaxRegression% worse than the max dict size dictionary. */
ZDICT_params_t zParams;
} ZDICT_cover_params_t;
typedef struct {
unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
unsigned f; /* log of size of frequency array : constraint: 0 < f <= 31 : 1 means default(20)*/
unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
double splitPoint; /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */
unsigned accel; /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */
unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size and selects the smallest dictionary that is shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */
unsigned shrinkDictMaxRegression; /* Sets shrinkDictMaxRegression so that a smaller dictionary can be at worse shrinkDictMaxRegression% worse than the max dict size dictionary. */
ZDICT_params_t zParams;
} ZDICT_fastCover_params_t;
/*! ZDICT_trainFromBuffer_cover():
* Train a dictionary from an array of samples using the COVER algorithm.
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
* supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
* The resulting dictionary will be saved into `dictBuffer`.
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
* or an error code, which can be tested with ZDICT_isError().
* See ZDICT_trainFromBuffer() for details on failure modes.
* Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte.
* Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
* It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
* In general, it's recommended to provide a few thousands samples, though this can vary a lot.
* It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
*/
ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
void *dictBuffer, size_t dictBufferCapacity,
const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
ZDICT_cover_params_t parameters);
/*! ZDICT_optimizeTrainFromBuffer_cover():
* The same requirements as above hold for all the parameters except `parameters`.
* This function tries many parameter combinations and picks the best parameters.
* `*parameters` is filled with the best parameters found,
* dictionary constructed with those parameters is stored in `dictBuffer`.
*
* All of the parameters d, k, steps are optional.
* If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
* if steps is zero it defaults to its default value.
* If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [50, 2000].
*
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
* or an error code, which can be tested with ZDICT_isError().
* On success `*parameters` contains the parameters selected.
* See ZDICT_trainFromBuffer() for details on failure modes.
* Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread.
*/
ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_cover_params_t* parameters);
/*! ZDICT_trainFromBuffer_fastCover():
* Train a dictionary from an array of samples using a modified version of COVER algorithm.
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
* supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
* d and k are required.
* All other parameters are optional, will use default values if not provided
* The resulting dictionary will be saved into `dictBuffer`.
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
* or an error code, which can be tested with ZDICT_isError().
* See ZDICT_trainFromBuffer() for details on failure modes.
* Note: ZDICT_trainFromBuffer_fastCover() requires 6 * 2^f bytes of memory.
* Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
* It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
* In general, it's recommended to provide a few thousands samples, though this can vary a lot.
* It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
*/
ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer,
size_t dictBufferCapacity, const void *samplesBuffer,
const size_t *samplesSizes, unsigned nbSamples,
ZDICT_fastCover_params_t parameters);
/*! ZDICT_optimizeTrainFromBuffer_fastCover():
* The same requirements as above hold for all the parameters except `parameters`.
* This function tries many parameter combinations (specifically, k and d combinations)
* and picks the best parameters. `*parameters` is filled with the best parameters found,
* dictionary constructed with those parameters is stored in `dictBuffer`.
* All of the parameters d, k, steps, f, and accel are optional.
* If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
* if steps is zero it defaults to its default value.
* If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [50, 2000].
* If f is zero, default value of 20 is used.
* If accel is zero, default value of 1 is used.
*
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
* or an error code, which can be tested with ZDICT_isError().
* On success `*parameters` contains the parameters selected.
* See ZDICT_trainFromBuffer() for details on failure modes.
* Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 6 * 2^f bytes of memory for each thread.
*/
ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
size_t dictBufferCapacity, const void* samplesBuffer,
const size_t* samplesSizes, unsigned nbSamples,
ZDICT_fastCover_params_t* parameters);
typedef struct {
unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */
ZDICT_params_t zParams;
} ZDICT_legacy_params_t;
/*! ZDICT_trainFromBuffer_legacy():
* Train a dictionary from an array of samples.
* Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
* supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
* The resulting dictionary will be saved into `dictBuffer`.
* `parameters` is optional and can be provided with values set to 0 to mean "default".
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
* or an error code, which can be tested with ZDICT_isError().
* See ZDICT_trainFromBuffer() for details on failure modes.
* Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
* It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
* In general, it's recommended to provide a few thousands samples, though this can vary a lot.
* It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
* Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0.
*/
ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_legacy_params_t parameters);
/* Deprecation warnings */
/* It is generally possible to disable deprecation warnings from compiler,
for example with -Wno-deprecated-declarations for gcc
or _CRT_SECURE_NO_WARNINGS in Visual.
Otherwise, it's also possible to manually define ZDICT_DISABLE_DEPRECATE_WARNINGS */
#ifdef ZDICT_DISABLE_DEPRECATE_WARNINGS
# define ZDICT_DEPRECATED(message) ZDICTLIB_API /* disable deprecation warnings */
#else
# define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
# define ZDICT_DEPRECATED(message) [[deprecated(message)]] ZDICTLIB_API
# elif defined(__clang__) || (ZDICT_GCC_VERSION >= 405)
# define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated(message)))
# elif (ZDICT_GCC_VERSION >= 301)
# define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated))
# elif defined(_MSC_VER)
# define ZDICT_DEPRECATED(message) ZDICTLIB_API __declspec(deprecated(message))
# else
# pragma message("WARNING: You need to implement ZDICT_DEPRECATED for this compiler")
# define ZDICT_DEPRECATED(message) ZDICTLIB_API
# endif
#endif /* ZDICT_DISABLE_DEPRECATE_WARNINGS */
ZDICT_DEPRECATED("use ZDICT_finalizeDictionary() instead")
size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
#endif /* ZDICT_STATIC_LINKING_ONLY */
#if defined (__cplusplus)
}
#endif
#endif /* DICTBUILDER_H_001 */