* Dump: Use mholt/archive/v3 to support tar including many compressions Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Dump: Allow dump output to stdout Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Dump: Fixed bug present since #6677 where SessionConfig.Provider is never "file" Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Dump: never pack RepoRootPath, LFS.ContentPath and LogRootPath when they are below AppDataPath Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Dump: also dump LFS (fixes #10058) Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Dump: never dump CustomPath if CustomPath is a subdir of or equal to AppDataPath (fixes #10365) Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Use log.Info instead of fmt.Fprintf Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * import ordering * make fmt Co-authored-by: zeripath <art27@cantab.net> Co-authored-by: techknowlogick <techknowlogick@gitea.io> Co-authored-by: Matti R <matti@mdranta.net>
		
			
				
	
	
		
			401 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Go
		
	
	
	
		
			Vendored
		
	
	
	
			
		
		
	
	
			401 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Go
		
	
	
	
		
			Vendored
		
	
	
	
| // Copyright 2015, Joe Tsai. All rights reserved.
 | |
| // Use of this source code is governed by a BSD-style
 | |
| // license that can be found in the LICENSE.md file.
 | |
| 
 | |
| // Package prefix implements bit readers and writers that use prefix encoding.
 | |
| package prefix
 | |
| 
 | |
| import (
 | |
| 	"fmt"
 | |
| 	"sort"
 | |
| 
 | |
| 	"github.com/dsnet/compress/internal"
 | |
| 	"github.com/dsnet/compress/internal/errors"
 | |
| )
 | |
| 
 | |
| func errorf(c int, f string, a ...interface{}) error {
 | |
| 	return errors.Error{Code: c, Pkg: "prefix", Msg: fmt.Sprintf(f, a...)}
 | |
| }
 | |
| 
 | |
| func panicf(c int, f string, a ...interface{}) {
 | |
| 	errors.Panic(errorf(c, f, a...))
 | |
| }
 | |
| 
 | |
| const (
 | |
| 	countBits = 5  // Number of bits to store the bit-length of the code
 | |
| 	valueBits = 27 // Number of bits to store the code value
 | |
| 
 | |
| 	countMask = (1 << countBits) - 1
 | |
| )
 | |
| 
 | |
| // PrefixCode is a representation of a prefix code, which is conceptually a
 | |
| // mapping from some arbitrary symbol to some bit-string.
 | |
| //
 | |
| // The Sym and Cnt fields are typically provided by the user,
 | |
| // while the Len and Val fields are generated by this package.
 | |
| type PrefixCode struct {
 | |
| 	Sym uint32 // The symbol being mapped
 | |
| 	Cnt uint32 // The number times this symbol is used
 | |
| 	Len uint32 // Bit-length of the prefix code
 | |
| 	Val uint32 // Value of the prefix code (must be in 0..(1<<Len)-1)
 | |
| }
 | |
| type PrefixCodes []PrefixCode
 | |
| 
 | |
| type prefixCodesBySymbol []PrefixCode
 | |
| 
 | |
| func (c prefixCodesBySymbol) Len() int           { return len(c) }
 | |
| func (c prefixCodesBySymbol) Less(i, j int) bool { return c[i].Sym < c[j].Sym }
 | |
| func (c prefixCodesBySymbol) Swap(i, j int)      { c[i], c[j] = c[j], c[i] }
 | |
| 
 | |
| type prefixCodesByCount []PrefixCode
 | |
| 
 | |
| func (c prefixCodesByCount) Len() int { return len(c) }
 | |
| func (c prefixCodesByCount) Less(i, j int) bool {
 | |
| 	return c[i].Cnt < c[j].Cnt || (c[i].Cnt == c[j].Cnt && c[i].Sym < c[j].Sym)
 | |
| }
 | |
| func (c prefixCodesByCount) Swap(i, j int) { c[i], c[j] = c[j], c[i] }
 | |
| 
 | |
| func (pc PrefixCodes) SortBySymbol() { sort.Sort(prefixCodesBySymbol(pc)) }
 | |
| func (pc PrefixCodes) SortByCount()  { sort.Sort(prefixCodesByCount(pc)) }
 | |
| 
 | |
| // Length computes the total bit-length using the Len and Cnt fields.
 | |
| func (pc PrefixCodes) Length() (nb uint) {
 | |
| 	for _, c := range pc {
 | |
| 		nb += uint(c.Len * c.Cnt)
 | |
| 	}
 | |
| 	return nb
 | |
| }
 | |
| 
 | |
| // checkLengths reports whether the codes form a complete prefix tree.
 | |
| func (pc PrefixCodes) checkLengths() bool {
 | |
| 	sum := 1 << valueBits
 | |
| 	for _, c := range pc {
 | |
| 		sum -= (1 << valueBits) >> uint(c.Len)
 | |
| 	}
 | |
| 	return sum == 0 || len(pc) == 0
 | |
| }
 | |
| 
 | |
| // checkPrefixes reports whether all codes have non-overlapping prefixes.
 | |
| func (pc PrefixCodes) checkPrefixes() bool {
 | |
| 	for i, c1 := range pc {
 | |
| 		for j, c2 := range pc {
 | |
| 			mask := uint32(1)<<c1.Len - 1
 | |
| 			if i != j && c1.Len <= c2.Len && c1.Val&mask == c2.Val&mask {
 | |
| 				return false
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	return true
 | |
| }
 | |
| 
 | |
| // checkCanonical reports whether all codes are canonical.
 | |
| // That is, they have the following properties:
 | |
| //
 | |
| //	1. All codes of a given bit-length are consecutive values.
 | |
| //	2. Shorter codes lexicographically precede longer codes.
 | |
| //
 | |
| // The codes must have unique symbols and be sorted by the symbol
 | |
| // The Len and Val fields in each code must be populated.
 | |
| func (pc PrefixCodes) checkCanonical() bool {
 | |
| 	// Rule 1.
 | |
| 	var vals [valueBits + 1]PrefixCode
 | |
| 	for _, c := range pc {
 | |
| 		if c.Len > 0 {
 | |
| 			c.Val = internal.ReverseUint32N(c.Val, uint(c.Len))
 | |
| 			if vals[c.Len].Cnt > 0 && vals[c.Len].Val+1 != c.Val {
 | |
| 				return false
 | |
| 			}
 | |
| 			vals[c.Len].Val = c.Val
 | |
| 			vals[c.Len].Cnt++
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Rule 2.
 | |
| 	var last PrefixCode
 | |
| 	for _, v := range vals {
 | |
| 		if v.Cnt > 0 {
 | |
| 			curVal := v.Val - v.Cnt + 1
 | |
| 			if last.Cnt != 0 && last.Val >= curVal {
 | |
| 				return false
 | |
| 			}
 | |
| 			last = v
 | |
| 		}
 | |
| 	}
 | |
| 	return true
 | |
| }
 | |
| 
 | |
| // GenerateLengths assigns non-zero bit-lengths to all codes. Codes with high
 | |
| // frequency counts will be assigned shorter codes to reduce bit entropy.
 | |
| // This function is used primarily by compressors.
 | |
| //
 | |
| // The input codes must have the Cnt field populated, be sorted by count.
 | |
| // Even if a code has a count of 0, a non-zero bit-length will be assigned.
 | |
| //
 | |
| // The result will have the Len field populated. The algorithm used guarantees
 | |
| // that Len <= maxBits and that it is a complete prefix tree. The resulting
 | |
| // codes will remain sorted by count.
 | |
| func GenerateLengths(codes PrefixCodes, maxBits uint) error {
 | |
| 	if len(codes) <= 1 {
 | |
| 		if len(codes) == 1 {
 | |
| 			codes[0].Len = 0
 | |
| 		}
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	// Verify that the codes are in ascending order by count.
 | |
| 	cntLast := codes[0].Cnt
 | |
| 	for _, c := range codes[1:] {
 | |
| 		if c.Cnt < cntLast {
 | |
| 			return errorf(errors.Invalid, "non-monotonically increasing symbol counts")
 | |
| 		}
 | |
| 		cntLast = c.Cnt
 | |
| 	}
 | |
| 
 | |
| 	// Construct a Huffman tree used to generate the bit-lengths.
 | |
| 	//
 | |
| 	// The Huffman tree is a binary tree where each symbol lies as a leaf node
 | |
| 	// on this tree. The length of the prefix code to assign is the depth of
 | |
| 	// that leaf from the root. The Huffman algorithm, which runs in O(n),
 | |
| 	// is used to generate the tree. It assumes that codes are sorted in
 | |
| 	// increasing order of frequency.
 | |
| 	//
 | |
| 	// The algorithm is as follows:
 | |
| 	//	1. Start with two queues, F and Q, where F contains all of the starting
 | |
| 	//	symbols sorted such that symbols with lowest counts come first.
 | |
| 	//	2. While len(F)+len(Q) > 1:
 | |
| 	//		2a. Dequeue the node from F or Q that has the lowest weight as N0.
 | |
| 	//		2b. Dequeue the node from F or Q that has the lowest weight as N1.
 | |
| 	//		2c. Create a new node N that has N0 and N1 as its children.
 | |
| 	//		2d. Enqueue N into the back of Q.
 | |
| 	//	3. The tree's root node is Q[0].
 | |
| 	type node struct {
 | |
| 		cnt uint32
 | |
| 
 | |
| 		// n0 or c0 represent the left child of this node.
 | |
| 		// Since Go does not have unions, only one of these will be set.
 | |
| 		// Similarly, n1 or c1 represent the right child of this node.
 | |
| 		//
 | |
| 		// If n0 or n1 is set, then it represents a "pointer" to another
 | |
| 		// node in the Huffman tree. Since Go's pointer analysis cannot reason
 | |
| 		// that these node pointers do not escape (golang.org/issue/13493),
 | |
| 		// we use an index to a node in the nodes slice as a pseudo-pointer.
 | |
| 		//
 | |
| 		// If c0 or c1 is set, then it represents a leaf "node" in the
 | |
| 		// Huffman tree. The leaves are the PrefixCode values themselves.
 | |
| 		n0, n1 int // Index to child nodes
 | |
| 		c0, c1 *PrefixCode
 | |
| 	}
 | |
| 	var nodeIdx int
 | |
| 	var nodeArr [1024]node // Large enough to handle most cases on the stack
 | |
| 	nodes := nodeArr[:]
 | |
| 	if len(nodes) < len(codes) {
 | |
| 		nodes = make([]node, len(codes)) // Number of internal nodes < number of leaves
 | |
| 	}
 | |
| 	freqs, queue := codes, nodes[:0]
 | |
| 	for len(freqs)+len(queue) > 1 {
 | |
| 		// These are the two smallest nodes at the front of freqs and queue.
 | |
| 		var n node
 | |
| 		if len(queue) == 0 || (len(freqs) > 0 && freqs[0].Cnt <= queue[0].cnt) {
 | |
| 			n.c0, freqs = &freqs[0], freqs[1:]
 | |
| 			n.cnt += n.c0.Cnt
 | |
| 		} else {
 | |
| 			n.cnt += queue[0].cnt
 | |
| 			n.n0 = nodeIdx // nodeIdx is same as &queue[0] - &nodes[0]
 | |
| 			nodeIdx++
 | |
| 			queue = queue[1:]
 | |
| 		}
 | |
| 		if len(queue) == 0 || (len(freqs) > 0 && freqs[0].Cnt <= queue[0].cnt) {
 | |
| 			n.c1, freqs = &freqs[0], freqs[1:]
 | |
| 			n.cnt += n.c1.Cnt
 | |
| 		} else {
 | |
| 			n.cnt += queue[0].cnt
 | |
| 			n.n1 = nodeIdx // nodeIdx is same as &queue[0] - &nodes[0]
 | |
| 			nodeIdx++
 | |
| 			queue = queue[1:]
 | |
| 		}
 | |
| 		queue = append(queue, n)
 | |
| 	}
 | |
| 	rootIdx := nodeIdx
 | |
| 
 | |
| 	// Search the whole binary tree, noting when we hit each leaf node.
 | |
| 	// We do not care about the exact Huffman tree structure, but rather we only
 | |
| 	// care about depth of each of the leaf nodes. That is, the depth determines
 | |
| 	// how long each symbol is in bits.
 | |
| 	//
 | |
| 	// Since the number of leaves is n, there is at most n internal nodes.
 | |
| 	// Thus, this algorithm runs in O(n).
 | |
| 	var fixBits bool
 | |
| 	var explore func(int, uint)
 | |
| 	explore = func(rootIdx int, level uint) {
 | |
| 		root := &nodes[rootIdx]
 | |
| 
 | |
| 		// Explore left branch.
 | |
| 		if root.c0 == nil {
 | |
| 			explore(root.n0, level+1)
 | |
| 		} else {
 | |
| 			fixBits = fixBits || (level > maxBits)
 | |
| 			root.c0.Len = uint32(level)
 | |
| 		}
 | |
| 
 | |
| 		// Explore right branch.
 | |
| 		if root.c1 == nil {
 | |
| 			explore(root.n1, level+1)
 | |
| 		} else {
 | |
| 			fixBits = fixBits || (level > maxBits)
 | |
| 			root.c1.Len = uint32(level)
 | |
| 		}
 | |
| 	}
 | |
| 	explore(rootIdx, 1)
 | |
| 
 | |
| 	// Fix the bit-lengths if we violate the maxBits requirement.
 | |
| 	if fixBits {
 | |
| 		// Create histogram for number of symbols with each bit-length.
 | |
| 		var symBitsArr [valueBits + 1]uint32
 | |
| 		symBits := symBitsArr[:] // symBits[nb] indicates number of symbols using nb bits
 | |
| 		for _, c := range codes {
 | |
| 			for int(c.Len) >= len(symBits) {
 | |
| 				symBits = append(symBits, 0)
 | |
| 			}
 | |
| 			symBits[c.Len]++
 | |
| 		}
 | |
| 
 | |
| 		// Fudge the tree such that the largest bit-length is <= maxBits.
 | |
| 		// This is accomplish by effectively doing a tree rotation. That is, we
 | |
| 		// increase the bit-length of some higher frequency code, so that the
 | |
| 		// bit-lengths of lower frequency codes can be decreased.
 | |
| 		//
 | |
| 		// Visually, this looks like the following transform:
 | |
| 		//
 | |
| 		//	Level   Before       After
 | |
| 		//	          __          ___
 | |
| 		//	         /  \        /   \
 | |
| 		//	 n-1    X  / \      /\   /\
 | |
| 		//	 n        X  /\    X  X X  X
 | |
| 		//	 n+1        X  X
 | |
| 		//
 | |
| 		var treeRotate func(uint)
 | |
| 		treeRotate = func(nb uint) {
 | |
| 			if symBits[nb-1] == 0 {
 | |
| 				treeRotate(nb - 1)
 | |
| 			}
 | |
| 			symBits[nb-1] -= 1 // Push this node to the level below
 | |
| 			symBits[nb] += 3   // This level gets one node from above, two from below
 | |
| 			symBits[nb+1] -= 2 // Push two nodes to the level above
 | |
| 		}
 | |
| 		for i := uint(len(symBits)) - 1; i > maxBits; i-- {
 | |
| 			for symBits[i] > 0 {
 | |
| 				treeRotate(i - 1)
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		// Assign bit-lengths to each code. Since codes is sorted in increasing
 | |
| 		// order of frequency, that means that the most frequently used symbols
 | |
| 		// should have the shortest bit-lengths. Thus, we copy symbols to codes
 | |
| 		// from the back of codes first.
 | |
| 		cs := codes
 | |
| 		for nb, cnt := range symBits {
 | |
| 			if cnt > 0 {
 | |
| 				pos := len(cs) - int(cnt)
 | |
| 				cs2 := cs[pos:]
 | |
| 				for i := range cs2 {
 | |
| 					cs2[i].Len = uint32(nb)
 | |
| 				}
 | |
| 				cs = cs[:pos]
 | |
| 			}
 | |
| 		}
 | |
| 		if len(cs) != 0 {
 | |
| 			panic("not all codes were used up")
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	if internal.Debug && !codes.checkLengths() {
 | |
| 		panic("incomplete prefix tree detected")
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // GeneratePrefixes assigns a prefix value to all codes according to the
 | |
| // bit-lengths. This function is used by both compressors and decompressors.
 | |
| //
 | |
| // The input codes must have the Sym and Len fields populated and be
 | |
| // sorted by symbol. The bit-lengths of each code must be properly allocated,
 | |
| // such that it forms a complete tree.
 | |
| //
 | |
| // The result will have the Val field populated and will produce a canonical
 | |
| // prefix tree. The resulting codes will remain sorted by symbol.
 | |
| func GeneratePrefixes(codes PrefixCodes) error {
 | |
| 	if len(codes) <= 1 {
 | |
| 		if len(codes) == 1 {
 | |
| 			if codes[0].Len != 0 {
 | |
| 				return errorf(errors.Invalid, "degenerate prefix tree with one node")
 | |
| 			}
 | |
| 			codes[0].Val = 0
 | |
| 		}
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	// Compute basic statistics on the symbols.
 | |
| 	var bitCnts [valueBits + 1]uint
 | |
| 	c0 := codes[0]
 | |
| 	bitCnts[c0.Len]++
 | |
| 	minBits, maxBits, symLast := c0.Len, c0.Len, c0.Sym
 | |
| 	for _, c := range codes[1:] {
 | |
| 		if c.Sym <= symLast {
 | |
| 			return errorf(errors.Invalid, "non-unique or non-monotonically increasing symbols")
 | |
| 		}
 | |
| 		if minBits > c.Len {
 | |
| 			minBits = c.Len
 | |
| 		}
 | |
| 		if maxBits < c.Len {
 | |
| 			maxBits = c.Len
 | |
| 		}
 | |
| 		bitCnts[c.Len]++ // Histogram of bit counts
 | |
| 		symLast = c.Sym  // Keep track of last symbol
 | |
| 	}
 | |
| 	if minBits == 0 {
 | |
| 		return errorf(errors.Invalid, "invalid prefix bit-length")
 | |
| 	}
 | |
| 
 | |
| 	// Compute the next code for a symbol of a given bit length.
 | |
| 	var nextCodes [valueBits + 1]uint
 | |
| 	var code uint
 | |
| 	for i := minBits; i <= maxBits; i++ {
 | |
| 		code <<= 1
 | |
| 		nextCodes[i] = code
 | |
| 		code += bitCnts[i]
 | |
| 	}
 | |
| 	if code != 1<<maxBits {
 | |
| 		return errorf(errors.Invalid, "degenerate prefix tree")
 | |
| 	}
 | |
| 
 | |
| 	// Assign the code to each symbol.
 | |
| 	for i, c := range codes {
 | |
| 		codes[i].Val = internal.ReverseUint32N(uint32(nextCodes[c.Len]), uint(c.Len))
 | |
| 		nextCodes[c.Len]++
 | |
| 	}
 | |
| 
 | |
| 	if internal.Debug && !codes.checkPrefixes() {
 | |
| 		panic("overlapping prefixes detected")
 | |
| 	}
 | |
| 	if internal.Debug && !codes.checkCanonical() {
 | |
| 		panic("non-canonical prefixes detected")
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func allocUint32s(s []uint32, n int) []uint32 {
 | |
| 	if cap(s) >= n {
 | |
| 		return s[:n]
 | |
| 	}
 | |
| 	return make([]uint32, n, n*3/2)
 | |
| }
 | |
| 
 | |
| func extendSliceUint32s(s [][]uint32, n int) [][]uint32 {
 | |
| 	if cap(s) >= n {
 | |
| 		return s[:n]
 | |
| 	}
 | |
| 	ss := make([][]uint32, n, n*3/2)
 | |
| 	copy(ss, s[:cap(s)])
 | |
| 	return ss
 | |
| }
 |