(* $Id: seqdb_containers.mli 16180 2008-01-18 20:57:28Z gerd $ *)

(** Persistent containers *)

(** The {!Seqdb_containers.Superblock} module reads and writes the superblock
    of a Kvseq, Hindex, or Perm file. The superblock is always at the
    beginning of the file and has a fixed but configurable size. It is a
    good idea to make the superblock big enough that there is enough
    space for variables that might ever be added to the block. A variable
    needs 16 bytes. *)
module Superblock : sig
  type t
  (** A superblock, as created by [create_superblock] or read from a file *)

  val create_superblock : sbsize:int -> format:int64 -> purpose:string -> unit -> t
  (** Creates a new empty superblock with the passed parameters.
      The parameters correspond to the [SBSIZE], [FORMAT], and [PURPOSE]
      variables described in {!Seqdb_containers.Sb_consts}. *)

  val read_superblock : Seqdb_rdwr.file_descr -> t
  (** Reads the superblock from the file. The file position changes. *)

  val read_superblock_from_rw : Seqdb_rdwr.reader_writer -> t
  (** Reads the superblock from the reader/writer. The file position
      changes. *)

  val write_superblock : Seqdb_rdwr.file_descr -> t -> unit
  (** Writes the superblock to the file. The file position changes. *)

  val write_superblock_to_rw : Seqdb_rdwr.reader_writer -> t -> unit
  (** Writes the superblock to the reader/writer. The file position
      changes. *)

  val variable : t -> string -> int64
  (** Get the value of the named superblock variable.
      @raise Not_found if the variable does not exist *)

  val variables : t -> (string * int64) list
  (** Get all superblock variables as pairs of name and value *)

  val set_variable : t -> string -> int64 -> unit
  (** Set a superblock variable. It is not possible to change [SBSIZE]. *)

  val filesize : t -> Seqdb_rdwr.file_descr -> int64
  (** Return the logical file size: the [FILESIZE] variable if it exists,
      or else the real file size of the descriptor. Fails if [FILESIZE] is
      bigger than the real file size. *)

  val sbsize : t -> int
  (** Return the superblock size *)
end

(** Descriptions of the variables that may occur in superblocks can be found
    in the following [Sb_consts] module.
*)
module Sb_consts : sig
  (** Common constants used in superblocks *)

  (** Magic: *)

  val winkme : string
  (** This is the "#!WINKME" magic at the beginning of the superblock *)

  (** Names of superblock variables: *)

  val sbsize_name : string
  (** SBSIZE is the size of the superblock in bytes *)

  val format_name : string
  (** FORMAT says which format the file has: One of
      {!Seqdb_containers.Sb_consts.kvseq_format},
      {!Seqdb_containers.Sb_consts.hindex_format}, or
      {!Seqdb_containers.Sb_consts.perm_format}. *)

  val purpose_name : string
  (** PURPOSE is a string of 8 bytes that gives a hint about what the file
      is used for. The strings "FSYSDATA", "FSYSIDX", and "FSYSTIME" are
      used for file systems. *)

  val filesize_name : string
  (** FILESIZE is the logical length of the file. That means only the
      range from 0 to FILESIZE-1 is considered as the part of the file
      that contains valid data. The range FILESIZE to the real end is
      seen as allocated but free extension space. FILESIZE is used for
      all formats. As Hindex files have a fixed length, it does not make
      sense to allocate space beyond FILESIZE, however. *)

  val fileincr_name : string
  (** FILEINCR is the increment by which the file is extended when the
      file is full. Values of several megs keep the file from getting
      fragmented on the disk too much. *)

  val syncsize_name : string
  (** SYNCSIZE is the logical length of the file at the time of the
      last checkpoint. SYNCSIZE must not be larger than FILESIZE. The
      range SYNCSIZE to FILESIZE-1 is considered as possibly corrupted.
      SYNCSIZE only applies to Kvseq files. *)

  val synctime_name : string
  (** SYNCTIME is the timestamp (as seconds since the epoch) of the
      last checkpoint. SYNCTIME only applies to Kvseq files. *)

  val keyrepr_name : string
  (** KEYREPR says how the keys in a Kvseq file are represented. *)

  val valrepr_name : string
  (** VALREPR says how the values in a Kvseq file are represented. *)

  val kvdelfl_name : string
  (** KVDELFL says whether there is a delete flag in every entry of a
      Kvseq file. *)

  val align_name : string
  (** ALIGN enforces the alignment of the Kvseq entries so entries can
      only start at multiples of ALIGN. *)

  val entries_name : string
  (** ENTRIES is the total number of entries in the file (deleted and
      active entries). ENTRIES is only meaningful for Kvseq and Hindex. *)

  val aentries_name : string
  (** AENTRIES is the number of active (non-deleted) entries in the file.
      AENTRIES is only meaningful for Kvseq and Hindex. *)

  val htalgo_name : string
  (** HTALGO sets the hash algorithm. The single allowed value is
      {!Seqdb_containers.Sb_consts.md5_algo}. *)

  val htsize_name : string
  (** HTSIZE is the size of the Hindex table as number of elements *)

  val cellsz_name : string
  (** CELLSZ is the size of the Hindex elements in the number of 8 byte
      words. Conventionally, the sizes 1 and 2 have the following meaning
      when the Hindex is used together with a Kvseq file:

      - A size of 1 word means that the single word contains the pointer
        to the corresponding Kvseq entry
      - A size of 2 words means that the first word contains the pointer
        to the corresponding Kvseq entry, and the second word is a hash
        value of the key of the entry. This is also known as the "stored
        hashes" variant. This variant is advantageous because it is no
        longer needed to read the key from the Kvseq entry in order to
        check whether the hash cell really points to the entry or is just
        a hash collision. *)

  val permsize_name : string
  (** PERMSIZE is the size of the Perm array in number of elements *)

  (** Superblock variables only meaningful in file systems: *)

  val havedups_name : string
  (** HAVEDUPS is used in the superblocks of the data files of file
      systems. If true, it is allowed that the data files contain several
      inodes for the same file. Only the last inode is to be seen as
      valid, however. The previous inodes are to be considered as deleted
      although the delete flag need not be set. HAVEDUPS is a boolean.

      In HAVEDUPS mode it is allowed to call the [delete_name_from_index]
      method of file systems. This method does not require a write to the
      data file in order to delete files. The price is, however, that
      iterating over the files of the system is a lot more costly. *)

  val isz_name : string
  (** ISZ is the default size for new inodes (existing inodes are allowed
      to have a different size). This variable occurs in the data file of
      file systems. *)

  val itotsz_name : string
  (** ITOTSZ is the total size of all active (non-deleted) inodes in bytes.
      This variable occurs in the data file of file systems. *)

  val dtotsz_name : string
  (** DTOTSZ is the estimated total size of all active (non-deleted) files
      in bytes. This variable occurs in the data file of file systems. *)

  val timemp_name : string
  (** TIMEMP says after how many seconds a new time mark is written.
      This variable occurs in the time index file of file systems. *)

  val timemark_name : string
  (** TIMEMARK is the value of the last written time mark. This variable
      occurs in the time index file of file systems. *)

  val timemn_name : string
  (** TIMEMN is the number of appended files since the last time mark
      was written. This variable occurs in the time index file of file
      systems. *)

  val lastmt_name : string
  (** LASTMT is the mtime timestamp of the last appended file *)

  (** Values for [FORMAT]: *)

  val kvseq_format : int64
  (** The [FORMAT] value of Kvseq files *)

  val hindex_format : int64
  (** The [FORMAT] value of Hindex files *)

  val perm_format : int64
  (** The [FORMAT] value of Perm files *)

  (** Values for [DELFLAG], [KVDELFL], [HAVEDUPS]: *)

  val false_bool : int64
  (** The representation of [false] in boolean variables *)

  val true_bool : int64
  (** The representation of [true] in boolean variables *)

  (** Values for [KEYREPR] and [VALREPR]: *)

  val int8_repr : int64
  (** The length of keys/values is stored in 1 byte. Allows lengths
      in the range 0 to 255. *)

  val int16_repr : int64
  (** The length of keys/values is stored in 2 bytes. Allows lengths
      in the range 0 to 65535. *)

  val int32_repr : int64
  (** The length of keys/values is stored in 4 bytes. Allows lengths
      in the range 0 to 4294967295. *)

  val int64_repr : int64
  (** The length of keys/values is stored in 8 bytes. Allows practically
      unlimited length. *)

  val fixed_repr_min : int64
  val fixed_repr_max : int64
  (** If [KEYREPR]/[VALREPR] is a value [n] between [fixed_repr_min] and
      [fixed_repr_max], the length of the keys/values is fixed to
      [n - fixed_repr_min]. *)

  val lim8_repr_min : int64
  val lim8_repr_max : int64
  (** If [KEYREPR]/[VALREPR] is a value [n] between [lim8_repr_min] and
      [lim8_repr_max], a fixed space of [n - lim8_repr_min + 1] is reserved
      for the key/value string, and the string can have a variable size
      from 0 to [n - lim8_repr_min + 1]. "Variable" means that existing
      keys/values can be overwritten with any new string in the size
      range. *)

  (** Values for [HTALGO]: *)

  val md5_algo : int64
  (** Based on MD5 *)
end

(** The [Hash_algo] module defines the available hash algorithms - of which
    there is currently only one. *)
module Hash_algo : sig
  type hash_algo = [ `MD5 ]
  (** The available hash algorithms *)

  val hash_algo_of_htalgo : int64 -> hash_algo
  val htalgo_of_hash_algo : hash_algo -> int64
  (** Convert from/to HTALGO superblock values *)

  val hash : hash_algo -> string -> int64
  (** The hash value of a key must fit in a positive int64 number *)

  val other_hash : hash_algo -> string -> int64
  (** Another hash value computed from the key *)
end

(** A module of type [POINTABLE] describes a container with entries that
    can be pointed to. {!Seqdb_containers.KVSEQ} is an example of a
    [POINTABLE] container. *)
module type POINTABLE = sig
  (** A pointable container has pointers to its entries *)

  type t
  (** The container *)

  type entry
  (** Entries of the container *)

  type pointer
  (** Pointers to the entries *)

  val string_of_pointer : pointer -> string
  val pointer_of_string : string -> pointer
  val pointer_length : int
  (** Convert pointer to/from strings. The strings all have the same
      length, namely [pointer_length]. Usually, the strings have a length
      of 8 (i.e. a long file pointer) or 16 (a file pointer plus a hash
      value). This corresponds to the [CELLSZ] superblock variable. *)

  val validate_pointer : t -> pointer -> bool
  (** Checks whether the pointer is in the valid range. Returns [true]
      if so *)

  val get_pointer : entry -> pointer
  (** Get the pointer of an entry *)

  val lookup : t -> pointer -> entry
  (** Get an entry by looking up a pointer.
      @raise Not_found if there is no such entry *)
end

(** A module of type [HASHABLE] describes a container where the entries
    can be hashed, i.e. looked up by some kind of hash value. *)
module type HASHABLE = sig
  (** A hashtable container can find its entries by hashes *)

  include POINTABLE

  val get_key : entry -> string
  (** Get the key of the entry *)

  val has_key : entry -> string -> bool
  (** Checks whether the entry has the key. This might be cheaper than
      comparing the result of [get_key] with the given key. *)

  val suggested_hash_algo : t -> Hash_algo.hash_algo option
  (** Get the suggested hash algorithm, if any *)

  val free_mark : string
  val del_mark : string
  (** The pointer values marking free and deleted hash table cells *)
end

(** The module type [KVSEQ] is the output type of the [Kvseq]
    implementation, that means it describes all the functions to read/write
    Kvseq files. A [KVSEQ] module is also [POINTABLE] and [HASHABLE]. *)
module type KVSEQ = sig
  type t
  (** The kvseq file *)

  type entry
  (** A live entry in a kvseq file *)

  type pointer
  (** A pointer to a live entry *)

  type contents = {
    delflag : bool;
    (** The delete flag (if supported, [false] otherwise) *)
    key : string;
    (** The keys are arbitrary strings of their [repr_class] *)
    value : string
    (** The values are arbitrary strings of their [repr_class] *)
  }

  type repr_class = [ `Int8 | `Int16 | `Int32 | `Int64 | `Fixed of int | `Lim8 of int ]
  (** Representation class:

      - [`Int8]: Strings up to a length of 255 bytes
      - [`Int16]: Strings up to a length of 65535 bytes
      - [`Int32]: Strings up to a length of 2^32-1 bytes
      - [`Int64]: Strings up to a length of 2^63-1 bytes (signed!)
      - [`Fixed n]: Strings with a length of exactly [n] bytes.
        0 <= n <= 255.
      - [`Lim8 n]: Strings up to a length of [n] bytes where [n] is at
        most 255. [n] bytes are always allocated for the string, so its
        length can be changed later. *)

  (** {b File descriptors.} In order to open a Kvseq file one has to call
      [create] or [access]. Both functions require a
      {!Seqdb_rdwr.file_descr} object as input argument. This object allows
      detailed control over the lifetime of the Unix file descriptor.
      This object has the method [file_descr] which must simply return
      the open file descriptor of the file, and [dispose_hint] which
      may optionally be interpreted by the object. A simple version of this
      object would be:

      {[
        let fd = Unix.openfile filename [ Unix.O_RDWR ] 0 in
        object
          method file_descr = fd
          method dispose_hint() = ()
        end
      ]}

      Generally, one has to call the {!Seqdb_containers.KVSEQ.flush}
      function to ensure that all data are written to the file before
      closing the descriptor. So it would be legal to do:

      {[
        flush kvseq;
        Unix.close fd
      ]}

      However, one has to be careful not to use [kvseq] anymore if the
      file descriptor object is as simple as shown, because the
      [file_descr] method would now return the closed descriptor.
      By using cleverer file descriptor objects it is possible to continue
      to access [kvseq] after flushing. See {!Seqdb_rdwr.file_descr} and
      {!Seqdb_rdwr.managed_descr} for details. *)

  val create : ?buffer_size:int -> ?chunk_size:int ->
               ?sbsize:int -> ?fileincr:int64 -> ?supports_deletions:bool ->
               ?keyrepr:repr_class -> ?valrepr:repr_class ->
               ?alignment:int -> ?have_statistics:bool ->
               ?suggested_hash_algo:Hash_algo.hash_algo ->
               ?purpose:string ->
               Seqdb_rdwr.file_descr -> t
  (** Write an empty kvseq structure into the file referenced by the file
      descriptor.

      The parameters mean (but see also {!Seqdb_containers.Sb_consts} and
      {!Seqdb_formats}):

      - [sbsize] is the size of the superblock to write (512 by default)
      - [fileincr] is by how many bytes full files are extended if more
        space is needed (4M by default)
      - [supports_deletions] is whether the entries have the delete flag
        ([true] by default)
      - [keyrepr] is how the keys are represented ([`Int64] by default)
      - [valrepr] is how the values are represented ([`Int64] by default)
      - [alignment] is whether an alignment constraint applies (not set
        by default)
      - [have_statistics] says whether statistics about the number of
        entries and their size in bytes are maintained in superblock
        variables (ENTRIES, and AENTRIES - [true] by default).
      - [suggested_hash_algo] is always [`MD5]
      - [purpose] is a string up to 8 chars describing the purpose of the
        file

      The above parameters are saved in the superblock. Only [purpose]
      and [fileincr] can be changed later by modifying the superblock.

      There are also parameters configuring the access layer. These are
      only valid as long as the file is accessed, and can be set to
      different values every time the file is opened:

      - [buffer_size]: the size of the RAM buffer. The buffer is split up
        into chunks of [chunk_size], and every chunk may point to a
        different area in the file. A value less than [chunk_size] means
        ad-hoc buffering of up to [chunk_size] bytes. (Default: 0)
      - [chunk_size]: the size of an individual buffer chunk.
        (Default: 16K) *)

  val access : ?buffer_size:int -> ?chunk_size:int ->
               ?conservative:bool ->
               Seqdb_rdwr.file_descr -> t
  (** Access the kvseq file referenced by the file descriptor.

      - [conservative]: If true, the logical file length is assumed to
        be only [SYNCSIZE] and not [FILESIZE], i.e. a rollback is done
        if [SYNCSIZE < FILESIZE]. You can test this special condition
        after [access] with [rollback_flag] (below). The actual rollback
        is not immediately done, but at the next good opportunity. You
        can enforce it by calling [sync].
        (Default: [conservative=false])
      - [buffer_size]: see {!Seqdb_containers.KVSEQ.create}
      - [chunk_size]: see {!Seqdb_containers.KVSEQ.create} *)

  val superblock : t -> Superblock.t
  (** Get the superblock. Note that if you modify the superblock, it is
      not automatically written back unless you also call
      [mark_superblock_as_dirty]. *)

  val mark_superblock_as_dirty : t -> unit
  (** The superblock is marked as dirty, and will be written out at the
      next good opportunity *)

  val rollback_flag : t -> bool
  (** After opening the kvseq with [access], this flag is [true] if
      a rollback to the last synchronized file size is to be done.
      See also the description for [access]. The flag is reset when the
      rollback has been carried out. *)

  val configure : ?flush_every:int -> ?auto_sync:int option ->
                  ?auto_fadvise:bool -> ?onsync:(unit -> unit) ->
                  t -> unit
  (** Sets some (non-persistent) parameters:

      - [flush_every]: The superblock is written every this number of
        adds, deletes, replaces. (Default: 1)
      - [auto_sync]: If non-[None], every [auto_sync] seconds the whole
        file is synced to disk. This also sets the superblock variables
        [SYNCSIZE] and [SYNCTIME]. A value of 0 means: sync after every
        modification. (Default: Some 0 - {b you want to change this})
      - [auto_fadvise]: Advises to remove the file from the page cache
        after every sync (automatic syncs & explicit syncs). The superblock
        is not removed from the cache. (Default: false)
      - [onsync]: This function is called before an automatic or explicit
        sync is done. For example, one can sync the attached Hindex
        at that time. (Default: do nothing) *)

  val get_pointer : entry -> pointer
  (** Get the pointer of an entry *)

  val get_contents : entry -> contents
  (** Get the contents of an entry *)

  val get_key : entry -> string
  (** Get only the key of the entry *)

  val has_key : entry -> string -> bool
  (** Checks whether the entry has the key *)

  val get_value : entry -> string
  (** Get only the value of the entry *)

  val get_value_length : entry -> int64
  (** Get the length of the value *)

  val get_total_length : entry -> int64
  (** Get the total length (used space) of the entry *)

  val get_delflag : entry -> bool
  (** Get only the delflag of the entry *)

  val lookup : t -> pointer -> entry
  (** Get an entry by looking up a pointer *)

  val validate_pointer : t -> pointer -> bool
  (** See [POINTABLE] *)

  val add : t -> contents -> entry
  (** Add another value to the file, and return the new entry.
      Fails if [delflag] is true, but the file does not support
      deletions. *)

  val replace : entry -> contents -> unit
  (** Replace the value stored inside an entry with a new version.
      The new version must have the same size unless the entry happens
      to be the last entry of the file. Fails otherwise.
      Also fails if [delflag] is true, but the file does not support
      deletions. *)

  val rename : entry -> string -> unit
  (** Rename the entry. The resulting new key must have the same
      size on disk as the old one. *)

  val delete : entry -> unit
  (** Same as replacing the entry with a deleted entry *)

  val blit_to_string : entry -> int64 -> string -> int -> int -> unit
  (** [blit_to_string e e_pos s s_pos len]: Copies the substring of length
      [len] at position [e_pos] from [e]'s value to [s] at position
      [s_pos]. *)

  val blit_from_string : string -> int -> entry -> int64 -> int -> unit
  (** [blit_from_string s s_pos e e_pos len]: Copies the substring of
      length [len] at position [s_pos] from [s] to [e]'s value at
      position [e_pos].

      Generally it is allowed that the value becomes longer by this
      operation. However, the same restriction as for [replace] applies:
      Unless the entry is the last, the length of the value must not be
      changed. *)

  val flush : t -> unit
  (** Ensure that everything is written out (but a sync is not forced).
      Furthermore, it is ensured that all file descriptors are forgotten
      about. *)

  val sync : t -> unit
  (** Ensure that everything is physically written to disk (implies
      [flush]). Also sets the superblock variables [SYNCSIZE] and
      [SYNCTIME]. *)

  val first_entry : t -> entry
  (** Returns the first entry.
      @raise End_of_file if there is no entry *)

  val next_entry : entry -> entry
  (** Returns the next entry of a given entry.
      @raise End_of_file if there is no further entry *)

  val recover_entry : Pcre.regexp -> t -> pointer option -> entry
  (** This is a recovery function for reading damaged files. It tries
      to find the next valid entry by investigating the file after the
      pointer on byte level (if no pointer is given, it tries to find the
      first valid entry of the file). The function only accepts entries
      whose keys match the passed regular expression.

      The idea is to call it with the pointer of the last readable entry
      in order to skip damaged regions in the file and to find the next
      valid entry after that.

      @raise End_of_file if nothing can be found *)

  val string_of_pointer : pointer -> string
  val pointer_of_string : string -> pointer
  val int64_of_pointer : pointer -> int64
  val pointer_length : int
  (** Convert pointer to/from string, and int64. [pointer_length = 8]. *)

  val keyrepr : t -> repr_class
  val valrepr : t -> repr_class
  val supports_deletions : t -> bool
  val alignment : t -> int
  val have_statistics : t -> bool
  val suggested_hash_algo : t -> Hash_algo.hash_algo option
  (** Query features of the file *)

  val num_entries : t -> int64
  val num_active_entries : t -> int64
  (** Get statistics.
      @raise Not_found if statistics are not available *)

  val free_mark : string
  val del_mark : string
  (** The pointer values marking free and deleted hash table cells *)

  val fadvise_wontneed : t -> unit
  (** Tell the page cache that we won't need this file any more. Note that
      when writing, only synced pages are affected. The superblock is
      excluded from the advice. *)

  val fadvise_iterating : t -> unit
  (** Tell the page cache that we are iterating over the file, and it is
      a good idea to read ahead pages. Additionally, pages that are no
      longer useful are removed from the page cache. It is sufficient
      to call this function once before starting the iteration. This mode
      is turned off when reaching EOF or when [fadvise_wontneed] is
      called. *)

  val fadvise_willneed : t -> int64 -> int64 -> unit
  (** Advise to load the size bytes at pointer *)

  (** General note about fadvise: The library only uses [FADV_WILLNEED]
      and [FADV_DONTNEED].

      When looking up an entry, the pages are read ahead if this looks
      useful (with [FADV_WILLNEED]). This is done anyway without needing
      any hint from the caller.

      The [fadvise_iterating] mode is implemented by giving
      [FADV_WILLNEED] and [FADV_DONTNEED] hints at the right moments.

      Note that by default the underlying device also does a read-ahead.
      This library does not depend on this function, however. Linux allows
      one to turn off this implied read-ahead by fadvising [FADV_RANDOM].
      This is not done by this library, but may be a good idea to do when
      only random lookups are expected. In this case, the user of this
      library should do it. *)
end

module Kvseq : KVSEQ
(** The basic version of Kvseq where pointers are plain file positions.
    This implementation has to be used when the [CELLSZ] of the
    Hindex file is 1. *)

module Hpointer (C:HASHABLE) : sig
  include HASHABLE with type t = C.t

  val downgrade : entry -> C.entry
  val upgrade : C.entry -> entry
  val downgrade_pointer : pointer -> C.pointer
end
(** This functor adds a hash value of the key to every pointer.
The pointers are now 8 bytes longer, and the additional 8 bytes contain
    the hash of the keys. The [entry] type is a hashed pointer with an
    optional inner [C.entry]. The [downgrade] function enforces to create
    the inner [C.entry], and performs disk lookups. The reverse [upgrade]
    function create an [entry] from a given [C.entry]. The [lookup]
    function does not access the disk, but simply returns an [entry]
    without inner [C.entry]. First the next [downgrade] will perform the
    disk access. The [has_key] function does not access the disk, too. *)

module Kvseq_hp : KVSEQ
(** A version of [KVSEQ] using pointers with key hashes.
    This implementation has to be used when the [CELLSZ] of the
    Hindex file is 2. *)

(** Output type of [Hindex], that means it describes all the functions
    to read/write Hindex files. *)
module type HINDEX = sig
  type t
  (** The hindex file *)

  type entry
  (** A live entry in an hindex file *)

  module Container : HASHABLE
  (** The indexed container *)

  type contents = Container.entry
  (** Contents of the container entries *)

  val create : ?buffer_size:int -> ?chunk_size:int ->
               ?sbsize:int -> ?htsize:int64 -> ?cellsz:int ->
               ?have_statistics:bool -> ?hash_algo:Hash_algo.hash_algo ->
               ?purpose:string ->
               Container.t -> Seqdb_rdwr.file_descr -> t
  (** Write an empty hindex structure into the file referenced by the file
      descriptor.

      The parameters mean (but see also {!Seqdb_containers.Sb_consts} and
      {!Seqdb_formats}):

      - [sbsize] is the size of the superblock to write (512 by default)
      - [htsize] is the size of the hash table. (Default: 1M)
      - [cellsz] is the size of the table cells in words
        (default: [Container.pointer_length / 8])
      - [have_statistics] says whether statistics about the number of
        entries and their size in bytes are maintained in superblock
        variables (ENTRIES, and AENTRIES - [true] by default).
      - [hash_algo] is always [`MD5]
      - [purpose] is a string up to 8 chars describing the purpose of the
        file

      The above parameters are saved in the superblock. Only [purpose]
      can be changed later by modifying the superblock.

      Note that [htsize] has to be chosen carefully. It is the maximum
      number of entries the index can have. The table will not grow
      automatically if that maximum is reached; the administrator will
      have to recreate the index file with a larger [htsize] in this case.
      Also note that it is unwise to fill the table close to its maximum
      because the performance decreases dramatically. Filling it to
      50-80% is acceptable.

      There are also parameters configuring the access layer. These are
      only valid as long as the file is accessed, and can be set to
      different values every time the file is opened:

      - [buffer_size]: the size of the RAM buffer. The buffer is split up
        into chunks of [chunk_size], and every chunk may point to a
        different area in the file. A value less than [chunk_size] means
        ad-hoc buffering of up to [chunk_size] bytes. (Default: 0)
      - [chunk_size]: the size of an individual buffer chunk.
        (Default: 16K) *)

  val access : ?fully_buffered_index:bool ->
               ?buffer_size:int -> ?chunk_size:int ->
               ?superblock:Superblock.t ->
               Container.t -> Seqdb_rdwr.file_descr -> t
  (** Access the hindex file referenced by the file descriptor

      - [fully_buffered_index]: If true, the buffer for the index will
        be made large enough that the whole index fits into it.
        (Default: false)
      - [superblock]: Only internally used
      - [buffer_size]: see [create]
      - [chunk_size]: see [create] *)

  val superblock : t -> Superblock.t
  (** Get the superblock *)

  val mark_superblock_as_dirty : t -> unit
  (** The superblock is marked as dirty, and will be written out at the
      next good opportunity *)

  val configure : ?flush_every:int -> ?auto_fadvise:bool ->
                  ?random_fadvise:bool -> t -> unit
  (** Sets some (non-persistent) parameters:

      - [flush_every]: The superblock is written every this number of
        adds, replaces, deletes. (Default: 1)
      - [auto_fadvise]: Advises to remove the file from the page cache
        after every sync (automatic syncs & explicit syncs). The superblock
        is not removed. (Default: false)
      - [random_fadvise]: Whether to advise the OS to assume fully random
        accesses (Default: false) *)

  val get_pointer : entry -> Container.pointer
  (** Get the pointer to the container entry (or [Not_found] if it is
      deleted) *)

  val get_contents : entry -> Container.entry
  (** Get the contents of an entry (or [Not_found] if it is deleted) *)

  val lookup : t -> string -> entry
  (** Get an entry by looking up a key.
      @raise Not_found if the key cannot be found *)

  val pointer_hint : t -> string -> Container.pointer
  (** Get the most likely position of the key in the container. This
      function only accesses the index, and not the container. May raise
      [Not_found] if evidence is found that the key does not exist.
      Note that the function returns wrong values with some (low)
      likelihood. It is intended to order random accesses for speed. *)

  val index_hint : t -> string -> int64
  (** Get the most likely index position (in cell numbers) of the key *)

  val add : t -> Container.entry -> entry
  (** Add another value to the file, and return the new entry.
      If the same key is added twice, this is not rejected, but there is
      no way to access the second entry. *)

  val replace : entry -> Container.entry -> unit
  (** Replace the value stored inside an entry with a new version.
      The old and the new version must have the same key. *)

  val delete : entry -> unit
  (** Marks the entry as deleted *)

  val first_entry : t -> entry
  (** Returns the first entry.
      @raise End_of_file if there is no entry *)

  val next_entry : entry -> entry
  (** Returns the next entry of a given entry.
      @raise End_of_file if there is no further entry *)

  val flush : t -> unit
  (** Ensure that everything is written out (but a sync is not forced).
      Furthermore, it is ensured that all file descriptors are forgotten
      about. *)

  val sync : t -> unit
  (** Ensure that everything is physically written to disk (implies
      [flush]) *)

  val have_statistics : t -> bool
  val hash_algo : t -> Hash_algo.hash_algo
  val htsize : t -> int64
  (** Query features of the file *)

  val num_entries : t -> int64
  val num_active_entries : t -> int64
  (** Get statistics. Fails if statistics are not available. *)

  val fadvise_wontneed : t -> unit
  (** Tell the page cache that we won't need this file any more. Note that
      when writing, only synced pages are affected. The superblock is
      excluded from the advice. *)
end

module Hindex (C : HASHABLE) : HINDEX with module Container = C
(** Default implementation of [HINDEX]. Intended to be used together
    with [Kvseq]:

    {[ module HI = Hindex(Kvseq) ]}

    However, it is possible to use it on other types of [HASHABLE], too. *)

module Autoindex (C : HASHABLE) : HINDEX with module Container = C
(** This [HINDEX] implementation can cope with index files that have
    either [CELLSZ=1] or [CELLSZ=2]. In the first case, the hash table
    cells contain only pointers to the kvseq file - this is the same as
    [Hindex(Kvseq)]. In the latter case, stored hashes are assumed - this
    is the same as [Hindex(Hpointer(Kvseq))].

    [C.pointer_length = 8] is required for the input [HASHABLE]. This is
    the case for [Kvseq], so one can use:

    {[ module AI = Autoindex(Kvseq) ]} *)

(** Note that the following file format, Perm, has not yet had the chance
    to mature. *)

(** Output type of Perm *)
module type PERM = sig
  type t
  (** The perm file *)

  type entry
  (** A live entry in a perm file *)

  module Container : HASHABLE
  (** The permuted container *)

  val create : ?sbsize:int -> ?hash_algo:Hash_algo.hash_algo ->
               ?fileincr:int64 -> ?purpose:string ->
               Container.t -> Seqdb_rdwr.file_descr -> t
  (** Write an empty perm structure into the file referenced by the file
      descriptor. *)

  val access : Container.t -> Seqdb_rdwr.file_descr -> t
  (** Access the perm file referenced by the file descriptor *)

  val superblock : t -> Superblock.t
  (** Get the superblock *)

  val mark_superblock_as_dirty : t -> unit
  (** The superblock is marked as dirty, and will be written out at the
      next good opportunity *)

  val configure : ?flush_every:int -> ?auto_fadvise:bool -> t -> unit
  (** Sets some (non-persistent) parameters:

      - [flush_every]: The superblock is written every this number of
        adds, deletes, replaces. (Default: 1)
      - [auto_fadvise]: Advises to remove the file from the page cache
        after every sync (automatic syncs & explicit syncs). The superblock
        is not removed. (Default: false) *)

  val size : t -> int64
  (** The size of the permutation array in number of entries *)

  val add : t -> Container.entry -> entry
  (** Adds a new entry at the end of the permutation array.
      It isn't checked whether the [Container.pointer] is already a member
      of the array! *)

  val flush : t -> unit
  (** Ensure that everything is written out (but a sync is not forced).
      Furthermore, it is ensured that all file descriptors are forgotten
      about. *)

  val sync : t -> unit
  (** Ensure that everything is physically written to disk (implies
      [flush]) *)

  val get_contents : entry -> Container.entry
  (** Get the contents of an entry *)

  val get_index : entry -> int64
  (** Get the index of an entry, [0 <= index < size] *)

  val lookup : t -> int64 -> entry
  (** Get an entry by looking up an index, [0 <= index < size] *)

  (* val swap : entry -> entry -> unit (* Swaps two entries *) *)

  val group : ?ext:(Seqdb_rdwr.file_descr * int) -> t -> unit
  (** Sorts the entries by first their hash value, and then by their
      keys. The result is a perm array where all entries for a key are
      adjacent.

      [ext]: If passed as [(fd, n)], an external sort is used with a chunk
      size of [n] (i.e. up to [n] entries are sorted in memory). [fd] is
      a file that serves as scratch space.

      {b EXTERNAL SORTING IS NOT YET IMPLEMENTED!} *)

  val hash_algo : t -> Hash_algo.hash_algo
  (** Query features of the file *)

  val fadvise_wontneed : t -> unit
  (** Tell the page cache that we won't need this file any more. Note that
      when writing, only synced pages are affected. The superblock is
      excluded from the advice. *)
end

module Perm (C : HASHABLE) : PERM with module Container = C
(** Default implementation of the [PERM] modtype *)