githaven-fork/vendor/github.com/ulikunitz/xz/lzma/hashtable.go

// Copyright 2014-2017 Ulrich Kunitz. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package lzma

import (
	"errors"
	"fmt"

	"github.com/ulikunitz/xz/internal/hash"
)

/* For compression we need to find byte sequences that match the byte
 * sequence at the dictionary head. A hash table is a simple method to
 * provide this capability.
 */

// maxMatches limits the number of matches requested from the Matches
// function. This controls the speed of the overall encoding.
const maxMatches = 16

// shortDists defines the number of short distances supported by the
// implementation.
const shortDists = 8

// The minimum is somehow arbitrary but the maximum is limited by the
// memory requirements of the hash table.
const (
	minTableExponent = 9
	maxTableExponent = 20
)

// newRoller contains the function used to create an instance of the
// hash.Roller.
var newRoller = func(n int) hash.Roller { return hash.NewCyclicPoly(n) }

// hashTable stores the hash table including the rolling hash method.
//
// We implement chained hashing into a circular buffer. Each entry in
// the circular buffer stores the delta distance to the next position with a
// word that has the same hash value.
type hashTable struct {
	dict *encoderDict
	// actual hash table
	t []int64
	// circular list data with the offset to the next word
	data  []uint32
	front int
	// mask for computing the index for the hash table
	mask uint64
	// hash offset; initial value is -int64(wordLen)
	hoff int64
	// length of the hashed word
	wordLen int
	// hash roller for computing the hash values for the Write
	// method
	wr hash.Roller
	// hash roller for computing arbitrary hashes
	hr hash.Roller
	// preallocated slices
	p         [maxMatches]int64
	distances [maxMatches + shortDists]int
}

// hashTableExponent derives the hash table exponent from the dictionary
// capacity.
func hashTableExponent(n uint32) int {
	e := 30 - nlz32(n)
	switch {
	case e < minTableExponent:
		e = minTableExponent
	case e > maxTableExponent:
		e = maxTableExponent
	}
	return e
}

// newHashTable creates a new hash table for words of length wordLen
func newHashTable(capacity int, wordLen int) (t *hashTable, err error) {
	if !(0 < capacity) {
		return nil, errors.New(
			"newHashTable: capacity must not be negative")
	}
	exp := hashTableExponent(uint32(capacity))
	if !(1 <= wordLen && wordLen <= 4) {
		return nil, errors.New("newHashTable: " +
			"argument wordLen out of range")
	}
	n := 1 << uint(exp)
	if n <= 0 {
		panic("newHashTable: exponent is too large")
	}
	t = &hashTable{
		t:       make([]int64, n),
		data:    make([]uint32, capacity),
		mask:    (uint64(1) << uint(exp)) - 1,
		hoff:    -int64(wordLen),
		wordLen: wordLen,
		wr:      newRoller(wordLen),
		hr:      newRoller(wordLen),
	}
	return t, nil
}

func (t *hashTable) SetDict(d *encoderDict) { t.dict = d }

// buffered returns the number of bytes that are currently hashed.
func (t *hashTable) buffered() int {
	n := t.hoff + 1
	switch {
	case n <= 0:
		return 0
	case n >= int64(len(t.data)):
		return len(t.data)
	}
	return int(n)
}

// addIndex adds n to an index ensuring that is stays inside the
// circular buffer for the hash chain.
func (t *hashTable) addIndex(i, n int) int {
	i += n - len(t.data)
	if i < 0 {
		i += len(t.data)
	}
	return i
}

// putDelta puts the delta instance at the current front of the circular
// chain buffer.
func (t *hashTable) putDelta(delta uint32) {
	t.data[t.front] = delta
	t.front = t.addIndex(t.front, 1)
}

// putEntry puts a new entry into the hash table. If there is already a
// value stored it is moved into the circular chain buffer.
func (t *hashTable) putEntry(h uint64, pos int64) {
	if pos < 0 {
		return
	}
	i := h & t.mask
	old := t.t[i] - 1
	t.t[i] = pos + 1
	var delta int64
	if old >= 0 {
		delta = pos - old
		if delta > 1<<32-1 || delta > int64(t.buffered()) {
			delta = 0
		}
	}
	t.putDelta(uint32(delta))
}

// WriteByte converts a single byte into a hash and puts them into the hash
// table.
func (t *hashTable) WriteByte(b byte) error {
	h := t.wr.RollByte(b)
	t.hoff++
	t.putEntry(h, t.hoff)
	return nil
}

// Write converts the bytes provided into hash tables and stores the
// abbreviated offsets into the hash table. The method will never return an
// error.
func (t *hashTable) Write(p []byte) (n int, err error) {
	for _, b := range p {
		// WriteByte doesn't generate an error.
		t.WriteByte(b)
	}
	return len(p), nil
}

// getMatches the matches for a specific hash. The functions returns the
// number of positions found.
//
// TODO: Make a getDistances because that we are actually interested in.
func (t *hashTable) getMatches(h uint64, positions []int64) (n int) {
	if t.hoff < 0 || len(positions) == 0 {
		return 0
	}
	buffered := t.buffered()
	tailPos := t.hoff + 1 - int64(buffered)
	rear := t.front - buffered
	if rear >= 0 {
		rear -= len(t.data)
	}
	// get the slot for the hash
	pos := t.t[h&t.mask] - 1
	delta := pos - tailPos
	for {
		if delta < 0 {
			return n
		}
		positions[n] = tailPos + delta
		n++
		if n >= len(positions) {
			return n
		}
		i := rear + int(delta)
		if i < 0 {
			i += len(t.data)
		}
		u := t.data[i]
		if u == 0 {
			return n
		}
		delta -= int64(u)
	}
}

// hash computes the rolling hash for the word stored in p. For correct
// results its length must be equal to t.wordLen.
func (t *hashTable) hash(p []byte) uint64 {
	var h uint64
	for _, b := range p {
		h = t.hr.RollByte(b)
	}
	return h
}

// Matches fills the positions slice with potential matches. The
// functions returns the number of positions filled into positions. The
// byte slice p must have word length of the hash table.
func (t *hashTable) Matches(p []byte, positions []int64) int {
	if len(p) != t.wordLen {
		panic(fmt.Errorf(
			"byte slice must have length %d", t.wordLen))
	}
	h := t.hash(p)
	return t.getMatches(h, positions)
}

// NextOp identifies the next operation using the hash table.
//
// TODO: Use all repetitions to find matches.
func (t *hashTable) NextOp(rep [4]uint32) operation {
	// get positions
	data := t.dict.data[:maxMatchLen]
	n, _ := t.dict.buf.Peek(data)
	data = data[:n]
	var p []int64
	if n < t.wordLen {
		p = t.p[:0]
	} else {
		p = t.p[:maxMatches]
		n = t.Matches(data[:t.wordLen], p)
		p = p[:n]
	}

	// convert positions in potential distances
	head := t.dict.head
	dists := append(t.distances[:0], 1, 2, 3, 4, 5, 6, 7, 8)
	for _, pos := range p {
		dis := int(head - pos)
		if dis > shortDists {
			dists = append(dists, dis)
		}
	}

	// check distances
	var m match
	dictLen := t.dict.DictLen()
	for _, dist := range dists {
		if dist > dictLen {
			continue
		}

		// Here comes a trick. We are only interested in matches
		// that are longer than the matches we have been found
		// before. So before we test the whole byte sequence at
		// the given distance, we test the first byte that would
		// make the match longer. If it doesn't match the byte
		// to match, we don't to care any longer.
		i := t.dict.buf.rear - dist + m.n
		if i < 0 {
			i += len(t.dict.buf.data)
		}
		if t.dict.buf.data[i] != data[m.n] {
			// We can't get a longer match. Jump to the next
			// distance.
			continue
		}

		n := t.dict.buf.matchLen(dist, data)
		switch n {
		case 0:
			continue
		case 1:
			if uint32(dist-minDistance) != rep[0] {
				continue
			}
		}
		if n > m.n {
			m = match{int64(dist), n}
			if n == len(data) {
				// No better match will be found.
				break
			}
		}
	}

	if m.n == 0 {
		return lit{data[0]}
	}
	return m
}
Dump: add output format tar and output to stdout (#10376) * Dump: Use mholt/archive/v3 to support tar including many compressions Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Dump: Allow dump output to stdout Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Dump: Fixed bug present since #6677 where SessionConfig.Provider is never "file" Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Dump: never pack RepoRootPath, LFS.ContentPath and LogRootPath when they are below AppDataPath Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Dump: also dump LFS (fixes #10058) Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Dump: never dump CustomPath if CustomPath is a subdir of or equal to AppDataPath (fixes #10365) Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Use log.Info instead of fmt.Fprintf Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * import ordering * make fmt Co-authored-by: zeripath <art27@cantab.net> Co-authored-by: techknowlogick <techknowlogick@gitea.io> Co-authored-by: Matti R <matti@mdranta.net> 2020-06-05 20:47:39 +00:00			`// Copyright 2014-2017 Ulrich Kunitz. All rights reserved.`
			`// Use of this source code is governed by a BSD-style`
			`// license that can be found in the LICENSE file.`

			`package lzma`

			`import (`
			`"errors"`
			`"fmt"`

			`"github.com/ulikunitz/xz/internal/hash"`
			`)`

			`/* For compression we need to find byte sequences that match the byte`
			`* sequence at the dictionary head. A hash table is a simple method to`
			`* provide this capability.`
			`*/`

			`// maxMatches limits the number of matches requested from the Matches`
			`// function. This controls the speed of the overall encoding.`
			`const maxMatches = 16`

			`// shortDists defines the number of short distances supported by the`
			`// implementation.`
			`const shortDists = 8`

			`// The minimum is somehow arbitrary but the maximum is limited by the`
			`// memory requirements of the hash table.`
			`const (`
			`minTableExponent = 9`
			`maxTableExponent = 20`
			`)`

			`// newRoller contains the function used to create an instance of the`
			`// hash.Roller.`
			`var newRoller = func(n int) hash.Roller { return hash.NewCyclicPoly(n) }`

			`// hashTable stores the hash table including the rolling hash method.`
			`//`
			`// We implement chained hashing into a circular buffer. Each entry in`
			`// the circular buffer stores the delta distance to the next position with a`
			`// word that has the same hash value.`
			`type hashTable struct {`
			`dict *encoderDict`
			`// actual hash table`
			`t []int64`
			`// circular list data with the offset to the next word`
			`data []uint32`
			`front int`
			`// mask for computing the index for the hash table`
			`mask uint64`
			`// hash offset; initial value is -int64(wordLen)`
			`hoff int64`
			`// length of the hashed word`
			`wordLen int`
			`// hash roller for computing the hash values for the Write`
			`// method`
			`wr hash.Roller`
			`// hash roller for computing arbitrary hashes`
			`hr hash.Roller`
			`// preallocated slices`
			`p [maxMatches]int64`
			`distances [maxMatches + shortDists]int`
			`}`

			`// hashTableExponent derives the hash table exponent from the dictionary`
			`// capacity.`
			`func hashTableExponent(n uint32) int {`
			`e := 30 - nlz32(n)`
			`switch {`
			`case e < minTableExponent:`
			`e = minTableExponent`
			`case e > maxTableExponent:`
			`e = maxTableExponent`
			`}`
			`return e`
			`}`

			`// newHashTable creates a new hash table for words of length wordLen`
			`func newHashTable(capacity int, wordLen int) (t *hashTable, err error) {`
			`if !(0 < capacity) {`
			`return nil, errors.New(`
			`"newHashTable: capacity must not be negative")`
			`}`
			`exp := hashTableExponent(uint32(capacity))`
			`if !(1 <= wordLen && wordLen <= 4) {`
			`return nil, errors.New("newHashTable: " +`
			`"argument wordLen out of range")`
			`}`
			`n := 1 << uint(exp)`
			`if n <= 0 {`
			`panic("newHashTable: exponent is too large")`
			`}`
			`t = &hashTable{`
			`t: make([]int64, n),`
			`data: make([]uint32, capacity),`
			`mask: (uint64(1) << uint(exp)) - 1,`
			`hoff: -int64(wordLen),`
			`wordLen: wordLen,`
			`wr: newRoller(wordLen),`
			`hr: newRoller(wordLen),`
			`}`
			`return t, nil`
			`}`

			`func (t hashTable) SetDict(d encoderDict) { t.dict = d }`

			`// buffered returns the number of bytes that are currently hashed.`
			`func (t *hashTable) buffered() int {`
			`n := t.hoff + 1`
			`switch {`
			`case n <= 0:`
			`return 0`
			`case n >= int64(len(t.data)):`
			`return len(t.data)`
			`}`
			`return int(n)`
			`}`

			`// addIndex adds n to an index ensuring that is stays inside the`
			`// circular buffer for the hash chain.`
			`func (t *hashTable) addIndex(i, n int) int {`
			`i += n - len(t.data)`
			`if i < 0 {`
			`i += len(t.data)`
			`}`
			`return i`
			`}`

			`// putDelta puts the delta instance at the current front of the circular`
			`// chain buffer.`
			`func (t *hashTable) putDelta(delta uint32) {`
			`t.data[t.front] = delta`
			`t.front = t.addIndex(t.front, 1)`
			`}`

			`// putEntry puts a new entry into the hash table. If there is already a`
			`// value stored it is moved into the circular chain buffer.`
			`func (t *hashTable) putEntry(h uint64, pos int64) {`
			`if pos < 0 {`
			`return`
			`}`
			`i := h & t.mask`
			`old := t.t[i] - 1`
			`t.t[i] = pos + 1`
			`var delta int64`
			`if old >= 0 {`
			`delta = pos - old`
			`if delta > 1<<32-1 \|\| delta > int64(t.buffered()) {`
			`delta = 0`
			`}`
			`}`
			`t.putDelta(uint32(delta))`
			`}`

			`// WriteByte converts a single byte into a hash and puts them into the hash`
			`// table.`
			`func (t *hashTable) WriteByte(b byte) error {`
			`h := t.wr.RollByte(b)`
			`t.hoff++`
			`t.putEntry(h, t.hoff)`
			`return nil`
			`}`

			`// Write converts the bytes provided into hash tables and stores the`
			`// abbreviated offsets into the hash table. The method will never return an`
			`// error.`
			`func (t *hashTable) Write(p []byte) (n int, err error) {`
			`for _, b := range p {`
			`// WriteByte doesn't generate an error.`
			`t.WriteByte(b)`
			`}`
			`return len(p), nil`
			`}`

			`// getMatches the matches for a specific hash. The functions returns the`
			`// number of positions found.`
			`//`
			`// TODO: Make a getDistances because that we are actually interested in.`
			`func (t *hashTable) getMatches(h uint64, positions []int64) (n int) {`
			`if t.hoff < 0 \|\| len(positions) == 0 {`
			`return 0`
			`}`
			`buffered := t.buffered()`
			`tailPos := t.hoff + 1 - int64(buffered)`
			`rear := t.front - buffered`
			`if rear >= 0 {`
			`rear -= len(t.data)`
			`}`
			`// get the slot for the hash`
			`pos := t.t[h&t.mask] - 1`
			`delta := pos - tailPos`
			`for {`
			`if delta < 0 {`
			`return n`
			`}`
			`positions[n] = tailPos + delta`
			`n++`
			`if n >= len(positions) {`
			`return n`
			`}`
			`i := rear + int(delta)`
			`if i < 0 {`
			`i += len(t.data)`
			`}`
			`u := t.data[i]`
			`if u == 0 {`
			`return n`
			`}`
			`delta -= int64(u)`
			`}`
			`}`

			`// hash computes the rolling hash for the word stored in p. For correct`
			`// results its length must be equal to t.wordLen.`
			`func (t *hashTable) hash(p []byte) uint64 {`
			`var h uint64`
			`for _, b := range p {`
			`h = t.hr.RollByte(b)`
			`}`
			`return h`
			`}`

			`// Matches fills the positions slice with potential matches. The`
			`// functions returns the number of positions filled into positions. The`
			`// byte slice p must have word length of the hash table.`
			`func (t *hashTable) Matches(p []byte, positions []int64) int {`
			`if len(p) != t.wordLen {`
			`panic(fmt.Errorf(`
			`"byte slice must have length %d", t.wordLen))`
			`}`
			`h := t.hash(p)`
			`return t.getMatches(h, positions)`
			`}`

			`// NextOp identifies the next operation using the hash table.`
			`//`
			`// TODO: Use all repetitions to find matches.`
			`func (t *hashTable) NextOp(rep [4]uint32) operation {`
			`// get positions`
			`data := t.dict.data[:maxMatchLen]`
			`n, _ := t.dict.buf.Peek(data)`
			`data = data[:n]`
			`var p []int64`
			`if n < t.wordLen {`
			`p = t.p[:0]`
			`} else {`
			`p = t.p[:maxMatches]`
			`n = t.Matches(data[:t.wordLen], p)`
			`p = p[:n]`
			`}`

			`// convert positions in potential distances`
			`head := t.dict.head`
			`dists := append(t.distances[:0], 1, 2, 3, 4, 5, 6, 7, 8)`
			`for _, pos := range p {`
			`dis := int(head - pos)`
			`if dis > shortDists {`
			`dists = append(dists, dis)`
			`}`
			`}`

			`// check distances`
			`var m match`
			`dictLen := t.dict.DictLen()`
			`for _, dist := range dists {`
			`if dist > dictLen {`
			`continue`
			`}`

			`// Here comes a trick. We are only interested in matches`
			`// that are longer than the matches we have been found`
			`// before. So before we test the whole byte sequence at`
			`// the given distance, we test the first byte that would`
			`// make the match longer. If it doesn't match the byte`
			`// to match, we don't to care any longer.`
			`i := t.dict.buf.rear - dist + m.n`
			`if i < 0 {`
			`i += len(t.dict.buf.data)`
			`}`
			`if t.dict.buf.data[i] != data[m.n] {`
			`// We can't get a longer match. Jump to the next`
			`// distance.`
			`continue`
			`}`

			`n := t.dict.buf.matchLen(dist, data)`
			`switch n {`
			`case 0:`
			`continue`
			`case 1:`
			`if uint32(dist-minDistance) != rep[0] {`
			`continue`
			`}`
			`}`
			`if n > m.n {`
			`m = match{int64(dist), n}`
			`if n == len(data) {`
			`// No better match will be found.`
			`break`
			`}`
			`}`
			`}`

			`if m.n == 0 {`
			`return lit{data[0]}`
			`}`
			`return m`
			`}`