batteries/stringx.lua

--[[
	extra string routines
]]

local path = (...):gsub("stringx", "")
local assert = require(path .. "assert")
local pretty = require(path .. "pretty")

--the module table that every routine below is attached to
--any key that isn't defined on it falls through to the standard
--string library via __index
local stringx = setmetatable({}, {
	__index = string
})

--split a string on a delimiter into an ordered table
--	self  - the string to split
--	delim - the delimiter string; nil or "" splits into individual characters
--	        (limit is not applied in that case)
--	limit - the maximum number of splits to perform (nil means unlimited);
--	        once it is reached the rest of the string becomes the last element
--	returns a new sequence of the pieces; for a non-empty delimiter it always
--	        holds (number of splits performed + 1) strings
--	ie stringx.split("a,b,c", ",") -> {"a", "b", "c"}
--	   stringx.split("a,b,c", ",", 1) -> {"a", "b,c"}
function stringx.split(self, delim, limit)
	delim = delim or ""
	limit = (limit ~= nil and limit) or math.huge

	assert:type(self, "string", "stringx.split - self", 1)
	assert:type(delim, "string", "stringx.split - delim", 1)
	assert:type(limit, "number", "stringx.split - limit", 1)

	--limit always has a value by this point, so check it unconditionally
	assert(limit >= 0, "limit must be zero or positive!")

	--we try to create as little garbage as possible!
	--only one table to contain the result, plus the split strings.
	--so we do two passes, and work with the bytes underlying the string
	--partly because string.find is not compiled on older luajit :)
	local res = {}
	local length = self:len()
	local delim_length = delim:len()

	--empty delim? split to individual characters
	if delim_length == 0 then
		for i = 1, length do
			res[i] = self:sub(i, i)
		end
		return res
	end

	local delim_start = delim:byte(1)
	--a delimiter can't begin any later than this and still fit in the string
	local last_start = length - delim_length + 1

	--pass 1
	--collect split sites (the index where each delimiter starts)
	local i = 1
	while i <= last_start do
		--scan for delimiter
		local has_whole_delim = self:byte(i) == delim_start
		if has_whole_delim then
			for j = 2, delim_length do
				if self:byte(i + j - 1) ~= delim:byte(j) then
					has_whole_delim = false
					break
				end
			end
		end

		if has_whole_delim then
			--out of splits? everything left belongs to the final section
			if #res >= limit then
				break
			end
			res[#res + 1] = i
			--jump over the delimiter we just consumed
			i = i + delim_length
		else
			--no match (or only a partial one); step a single byte so that a
			--delimiter starting inside the partial match is not skipped
			i = i + 1
		end
	end

	--pass 2
	--replace each split site with the substring that precedes it
	i = 1
	for si, j in ipairs(res) do
		res[si] = self:sub(i, j - 1)
		i = j + delim_length
	end
	--add the final section
	res[#res + 1] = self:sub(i, -1)
	--return the collection
	return res
end

--expose pretty.string (from the pretty module required above) as stringx.pretty
stringx.pretty = pretty.string

--(generate a map of whitespace byte values)
--keys are the byte values of space, tab, newline and carriage return,
--each mapped to true; trim, ltrim and rtrim look bytes up in it directly
local _whitespace_bytes = {}
do
	--the do block keeps the source string out of the file scope
	local _whitespace = " \t\n\r"
	for i = 1, _whitespace:len() do
		_whitespace_bytes[_whitespace:byte(i)] = true
	end
end

--strip whitespace from both ends of a string
--	"whitespace" here means exactly space, tab, newline and carriage return
--	(form feeds, vertical tabs and backspaces are left alone)
--
--	returns s itself when there is nothing to strip, "" when s is empty or
--	all whitespace, and otherwise a single new substring
function stringx.trim(s)
	local len = s:len()

	--walk in from the front to the first byte worth keeping
	local first = 1
	while first <= len and _whitespace_bytes[s:byte(first)] do
		first = first + 1
	end

	--ran off the end: empty or nothing but whitespace
	if first > len then
		return ""
	end

	--walk in from the back; this must stop at or before `first`
	local last = len
	while _whitespace_bytes[s:byte(last)] do
		last = last - 1
	end

	--already tight on both sides, hand back the original
	if first == 1 and last == len then
		return s
	end

	return s:sub(first, last)
end

--trim the start of a string
--	strips leading space, tab, newline and carriage return bytes
--	returns s itself when there is nothing to strip, and "" when s is
--	made up entirely of whitespace (matching what stringx.trim gives)
function stringx.ltrim(s)
	local len = #s
	--start one past the end, so that if the scan below never finds a
	--non-whitespace byte the result is the empty string
	local head = len + 1
	for i = 1, len do
		if not _whitespace_bytes[s:byte(i)] then
			head = i
			break
		end
	end
	--nothing to strip (this also covers the empty string)
	if head == 1 then
		return s
	end
	return s:sub(head)
end

--trim the end of a string
--	strips trailing space, tab, newline and carriage return bytes
--	returns s itself when there is nothing to strip, and "" when s is
--	made up entirely of whitespace (matching what stringx.trim gives)
function stringx.rtrim(s)
	local len = #s
	--start before the first byte, so that if the scan below never finds a
	--non-whitespace byte the result is the empty string
	local tail = 0

	for i = len, 1, -1 do
		if not _whitespace_bytes[s:byte(i)] then
			tail = i
			break
		end
	end

	--nothing to strip (this also covers the empty string)
	if tail == len then
		return s
	end

	return s:sub(1, tail)
end

--remove the indentation of the first non-blank line from every line of a string
--	s                   - the (usually multi-line) string to de-indent
--	keep_trailing_empty - truthy to leave empty lines at the end in place
--
--	the text is treated as "\r\n" separated if that sequence appears anywhere
--	in it, and as "\n" separated otherwise; the result is joined the same way
--	leading empty lines are always dropped
--	input whose first remaining line has no space/tab indent is re-joined and
--	returned right away, so trailing empty lines are untouched in that case
function stringx.deindent(s, keep_trailing_empty)
	--which newline flavour are we dealing with?
	local newline = "\n"
	if s:find("\r\n", nil, true) then
		newline = "\r\n"
	end

	local lines = stringx.split(s, newline)

	--drop empty lines at the top
	while lines[1] == "" do
		table.remove(lines, 1)
	end
	if #lines == 0 then
		return ""
	end

	--the run of spaces/tabs that opens the first line is the indent to remove
	--(the pattern can match nothing, giving an empty string)
	local indent = lines[1]:match("^[ \t]*")
	local indent_len = #indent
	if indent_len == 0 then
		return table.concat(lines, newline)
	end

	local res = {}
	for idx = 1, #lines do
		local line = lines[idx]
		--take as much of the line as could be indent; this is shorter than the
		--indent for short lines (eg blank lines an editor has trimmed)
		local lead = line:sub(1, indent_len)
		--strip it only when it agrees with the indent as far as it goes
		--lines indented some other way are passed through unchanged
		if lead == indent:sub(1, #lead) then
			line = line:sub(#lead + 1)
		end
		res[idx] = line
	end

	--unless asked otherwise, drop every empty line at the bottom
	if not keep_trailing_empty then
		for idx = #res, 1, -1 do
			if res[idx] ~= "" then
				break
			end
			res[idx] = nil
		end
	end

	return table.concat(res, newline)
end

--alias: stringx.dedent refers to the very same function as stringx.deindent
stringx.dedent = stringx.deindent

--fill in the $placeholders of a template string
--	a placeholder is a "$" followed by one or more letters, digits or underscores
--	sub is handed to gsub as the replacement, so it may be a table keyed by
--	placeholder name or a function that receives the name
--	only the resulting string is returned (gsub's match count is dropped)
-- ie stringx.apply_template("hello $name", {name = "tom"}) == "hello tom"
function stringx.apply_template(s, sub)
	--the parentheses truncate gsub's results to just the string
	return (s:gsub("%$([%w_]+)", sub))
end

--does haystack have needle somewhere inside it?
--	compares byte by byte rather than building substrings, so no garbage is made
--	an empty needle is found in any haystack, including an empty one
function stringx.contains(haystack, needle)
	local haystack_len = #haystack
	local needle_len = #needle
	--the last index a match could begin at and still fit
	local last_start = haystack_len - needle_len + 1

	local start = 1
	while start <= last_start do
		--count how many bytes agree from this starting point
		local matched = 0
		while
			matched < needle_len
			and haystack:byte(start + matched) == needle:byte(matched + 1)
		do
			matched = matched + 1
		end
		if matched == needle_len then
			return true
		end
		start = start + 1
	end

	return false
end

--does s begin with prefix?
--	compares byte by byte rather than building substrings, so no garbage is made
--	(the original note here records the loop as faster than string.find)
--	an empty prefix always matches
function stringx.starts_with(s, prefix)
	local pos = 1
	local prefix_len = #prefix
	while pos <= prefix_len do
		--past the end of s, byte() gives nil and so never equals a prefix byte
		if s:byte(pos) ~= prefix:byte(pos) then
			return false
		end
		pos = pos + 1
	end
	return true
end

--does s finish with suffix?
--	compares byte by byte from the back rather than building substrings,
--	so no garbage is made
--	an empty suffix always matches
function stringx.ends_with(s, suffix)
	--two cursors walking backwards in step, one per string
	local s_pos = #s
	local suffix_pos = #suffix
	while suffix_pos >= 1 do
		if s:byte(s_pos) ~= suffix:byte(suffix_pos) then
			return false
		end
		s_pos = s_pos - 1
		suffix_pos = suffix_pos - 1
	end
	return true
end

--split on a delimiter, trim each piece, and throw away pieces that end up empty
--handy for forgiving, hand-typed lists
--	"a,b,  c, " -> {"a", "b", "c"}
--	s     - the string to split
--	delim - passed straight through to stringx.split
--	returns the table produced by stringx.split, compacted in place
function stringx.split_and_trim(s, delim)
	local parts = stringx.split(s, delim)
	--index of the last slot holding a kept piece
	local kept = 0
	for i = 1, #parts do
		local piece = stringx.trim(parts[i])
		--clear the slot first; kept never runs ahead of i, so writing the
		--piece back below can't clobber anything still to be visited
		parts[i] = nil
		if piece ~= "" then
			kept = kept + 1
			parts[kept] = piece
		end
	end
	return parts
end

--hand the module table back to whoever required this file
return stringx
[added] stringx module (just split for now) and overlay to string on export 2020-04-17 00:35:00 +00:00			`--[[`
			`extra string routines`
			`]]`

added pretty module with pretty.print and pretty.string breaking: string.pretty still exists but takes new config argument 2021-07-05 06:12:16 +00:00			`local path = (...):gsub("stringx", "")`
[modified] refactored init.lua extensively; used new assert module in stringx and tablex 2020-05-19 02:03:45 +00:00			`local assert = require(path .. "assert")`
added pretty module with pretty.print and pretty.string breaking: string.pretty still exists but takes new config argument 2021-07-05 06:12:16 +00:00			`local pretty = require(path .. "pretty")`
[modified] refactored init.lua extensively; used new assert module in stringx and tablex 2020-05-19 02:03:45 +00:00
[added] stringx module (just split for now) and overlay to string on export 2020-04-17 00:35:00 +00:00			`local stringx = setmetatable({}, {`
			`__index = string`
			`})`

			`--split a string on a delimiter into an ordered table`
max_split -> limit, allow zero limit 2021-12-26 16:32:43 +00:00			`function stringx.split(self, delim, limit)`
stringx.split delimiter optional 2021-07-05 22:29:13 +00:00			`delim = delim or ""`
max_split -> limit, allow zero limit 2021-12-26 16:32:43 +00:00			`limit = (limit ~= nil and limit) or math.huge`
add max_split to stringx.split 2021-12-24 18:03:36 +00:00
[modified] refactored init.lua extensively; used new assert module in stringx and tablex 2020-05-19 02:03:45 +00:00			`assert:type(self, "string", "stringx.split - self", 1)`
			`assert:type(delim, "string", "stringx.split - delim", 1)`
max_split -> limit, allow zero limit 2021-12-26 16:32:43 +00:00			`assert:type(limit, "number", "stringx.split - limit", 1)`
[modified] refactored init.lua extensively; used new assert module in stringx and tablex 2020-05-19 02:03:45 +00:00
max_split -> limit, allow zero limit 2021-12-26 16:32:43 +00:00			`if limit then`
			`assert(limit >= 0, "max_split must be positive!")`
fix spacing 2021-12-24 18:19:38 +00:00			`end`
max_split -> limit, allow zero limit 2021-12-26 16:32:43 +00:00
[modified] moved tablex.stringify to stringx.pretty 2020-04-17 00:45:15 +00:00			`--we try to create as little garbage as possible!`
			`--only one table to contain the result, plus the split strings.`
			`--so we do two passes, and work with the bytes underlying the string`
			`--partly because string.find is not compiled on older luajit :)`
			`local res = {}`
[added] stringx module (just split for now) and overlay to string on export 2020-04-17 00:35:00 +00:00			`local length = self:len()`
			`--`
			`local delim_length = delim:len()`
[modified] moved tablex.stringify to stringx.pretty 2020-04-17 00:45:15 +00:00			`--empty delim? split to individual characters`
			`if delim_length == 0 then`
			`for i = 1, length do`
			`table.insert(res, self:sub(i, i))`
			`end`
			`return res`
			`end`
[added] stringx module (just split for now) and overlay to string on export 2020-04-17 00:35:00 +00:00			`local delim_start = delim:byte(1)`
[modified] moved tablex.stringify to stringx.pretty 2020-04-17 00:45:15 +00:00			`--pass 1`
			`--collect split sites`
[added] stringx module (just split for now) and overlay to string on export 2020-04-17 00:35:00 +00:00			`local i = 1`
			`while i <= length do`
			`--scan for delimiter`
			`if self:byte(i) == delim_start then`
			`local has_whole_delim = true`
			`for j = 2, delim_length do`
			`if self:byte(i + j - 1) ~= delim:byte(j) then`
			`has_whole_delim = false`
			`break`
			`end`
			`end`
			`if has_whole_delim then`
max_split -> limit, allow zero limit 2021-12-26 16:32:43 +00:00			`if #res < limit then`
add max_split to stringx.split 2021-12-24 18:03:36 +00:00			`table.insert(res, i)`
			`else`
			`break`
			`end`
[added] stringx module (just split for now) and overlay to string on export 2020-04-17 00:35:00 +00:00			`end`
			`--iterate forward`
			`i = i + delim_length`
			`else`
			`--iterate forward`
			`i = i + 1`
			`end`
			`end`
[modified] moved tablex.stringify to stringx.pretty 2020-04-17 00:45:15 +00:00			`--pass 2`
			`--collect substrings`
[added] stringx module (just split for now) and overlay to string on export 2020-04-17 00:35:00 +00:00			`i = 1`
			`for si, j in ipairs(res) do`
			`res[si] = self:sub(i, j-1)`
			`i = j + delim_length`
			`end`
			`--add the final section`
			`table.insert(res, self:sub(i, -1))`
			`--return the collection`
			`return res`
			`end`

added pretty module with pretty.print and pretty.string breaking: string.pretty still exists but takes new config argument 2021-07-05 06:12:16 +00:00			`stringx.pretty = pretty.string`
[modified] moved tablex.stringify to stringx.pretty 2020-04-17 00:45:15 +00:00
[added] stringx.trim 2020-11-12 05:36:06 +00:00			`--(generate a map of whitespace byte values)`
			`local _whitespace_bytes = {}`
			`do`
			`local _whitespace = " \t\n\r"`
			`for i = 1, _whitespace:len() do`
			`_whitespace_bytes[_whitespace:byte(i)] = true`
			`end`
			`end`

			`--trim all whitespace off the head and tail of a string`
			`-- specifically trims space, tab, newline, and carriage return characters`
			`-- ignores form feeds, vertical tabs, and backspaces`
			`--`
			`-- only generates one string of garbage in the case there's actually space to trim`
			`function stringx.trim(s)`
			`--cache`
			`local len = s:len()`

			`--we search for the head and tail of the string iteratively`
			`--we could fuse these loops, but two separate loops is a lot easier to follow`
			`--and branches less as well.`
			`local head = 0`
			`for i = 1, len do`
			`if not _whitespace_bytes[s:byte(i)] then`
			`head = i`
			`break`
			`end`
			`end`

			`local tail = 0`
			`for i = len, 1, -1 do`
			`if not _whitespace_bytes[s:byte(i)] then`
			`tail = i`
			`break`
			`end`
			`end`

			`--overlapping ranges means no content`
			`if head > tail then`
			`return ""`
			`end`
			`--limit ranges means no trim`
			`if head == 1 and tail == len then`
			`return s`
			`end`

			`--pull out the content`
			`return s:sub(head, tail)`
			`end`

[modified] PR #17 a little; simplified various unneeded sections and added comments 2021-04-14 07:06:31 +00:00			`--trim the start of a string`
Added ltrim and rtrim methods complementary methods to trim 2021-04-06 02:43:35 +00:00			`function stringx.ltrim(s)`
Refactored rtrim and ltrim to use _whitespace_bytes table 2021-04-12 15:38:03 +00:00			`local head = 1`
			`for i = 1, #s do`
			`if not _whitespace_bytes[s:byte(i)] then`
			`head = i`
			`break`
			`end`
			`end`
[modified] PR #17 a little; simplified various unneeded sections and added comments 2021-04-14 07:06:31 +00:00			`if head == 1 then`
			`return s`
			`end`
Refactored rtrim and ltrim to use _whitespace_bytes table 2021-04-12 15:38:03 +00:00			`return s:sub(head)`
Added ltrim and rtrim methods complementary methods to trim 2021-04-06 02:43:35 +00:00			`end`

[modified] PR #17 a little; simplified various unneeded sections and added comments 2021-04-14 07:06:31 +00:00			`--trim the end of a string`
Added ltrim and rtrim methods complementary methods to trim 2021-04-06 02:43:35 +00:00			`function stringx.rtrim(s)`
			`local tail = #s`
Refactored rtrim and ltrim to use _whitespace_bytes table 2021-04-12 15:38:03 +00:00
			`for i = #s, 1, -1 do`
			`if not _whitespace_bytes[s:byte(i)] then`
			`tail = i`
			`break`
			`end`
Added ltrim and rtrim methods complementary methods to trim 2021-04-06 02:43:35 +00:00			`end`
Refactored rtrim and ltrim to use _whitespace_bytes table 2021-04-12 15:38:03 +00:00
[modified] PR #17 a little; simplified various unneeded sections and added comments 2021-04-14 07:06:31 +00:00			`if tail == #s then`
			`return s`
			`end`

Added ltrim and rtrim methods complementary methods to trim 2021-04-06 02:43:35 +00:00			`return s:sub(1, tail)`
			`end`

[added] stringx.deindent (and alias dedent) 2020-11-12 03:32:56 +00:00			`function stringx.deindent(s, keep_trailing_empty)`
			`--detect windows or unix newlines`
			`local windows_newlines = s:find("\r\n", nil, true)`
			`local newline = windows_newlines and "\r\n" or "\n"`
			`--split along newlines`
			`local lines = stringx.split(s, newline)`
			`--detect and strip any leading blank lines`
			`while lines[1] == "" do`
			`table.remove(lines, 1)`
			`end`

			`--nothing to do`
			`if #lines == 0 then`
			`return ""`
			`end`

			`--detect indent`
			`local _, _, indent = lines[1]:find("^([ \t]*)")`
			`local indent_len = indent and indent:len() or 0`

			`--not indented`
			`if indent_len == 0 then`
			`return table.concat(lines, newline)`
			`end`

			`--de-indent the lines`
			`local res = {}`
			`for _, line in ipairs(lines) do`
fixed dedent() with trimmed blank lines (some editors do this automatically) 2022-09-20 05:00:44 +00:00			`if line ~= "" then`
			`local line_start = line:sub(1, indent:len())`
			`local start_len = line_start:len()`
			`if`
			`line_start == indent`
			`or (`
			`start_len < indent_len`
			`and line_start == indent:sub(1, start_len)`
			`)`
			`then`
			`line = line:sub(start_len + 1)`
			`end`
[added] stringx.deindent (and alias dedent) 2020-11-12 03:32:56 +00:00			`end`
			`table.insert(res, line)`
			`end`

made stringx.deindent remove all trailing empties rather than just one 2022-07-11 06:44:51 +00:00			`--should we keep any trailing empty lines?`
[added] stringx.deindent (and alias dedent) 2020-11-12 03:32:56 +00:00			`if not keep_trailing_empty then`
made stringx.deindent remove all trailing empties rather than just one 2022-07-11 06:44:51 +00:00			`while res[#res] == "" do`
[added] stringx.deindent (and alias dedent) 2020-11-12 03:32:56 +00:00			`table.remove(res)`
			`end`
			`end`

			`return table.concat(res, newline)`
			`end`

			`--alias`
			`stringx.dedent = stringx.deindent`

[added] stringx.apply_template for "$template_var" containing strings 2020-11-19 06:18:38 +00:00			`--apply a template to a string`
			`--supports $template style values, given as a table or function`
			`-- ie ("hello $name"):format({name = "tom"}) == "hello tom"`
			`function stringx.apply_template(s, sub)`
stringx.apply_template works with $snake_case_symbols 2021-07-13 06:47:19 +00:00			`local r = s:gsub("%$([%w_]+)", sub)`
[added] stringx.apply_template for "$template_var" containing strings 2020-11-19 06:18:38 +00:00			`return r`
			`end`

added stringx.contains 2021-07-05 22:29:30 +00:00			`--check if a given string contains another`
			`--(without garbage)`
			`function stringx.contains(haystack, needle)`
			`for i = 1, #haystack - #needle + 1 do`
			`local found = true`
			`for j = 1, #needle do`
			`if haystack:byte(i + j - 1) ~= needle:byte(j) then`
			`found = false`
			`break`
			`end`
			`end`
			`if found then`
			`return true`
			`end`
			`end`
			`return false`
			`end`

[modified] stringx.starts_with to not generate garbage 2021-03-02 00:47:28 +00:00			`--check if a given string starts with another`
			`--(without garbage)`
Add comments for things that surprised me As a new user, there were things I was skeptical about and after digging in, these were my conclusions. Compared to the simple and obvious lua wiki solutions, batteries' string functions are slightly faster. GC is the same. Test local str = "hello world" local fn = function() local x = 0 if stringx.ends_with(str, "h") then x = x + 1 end if stringx.ends_with(str, "helll") then x = x + 1 end if stringx.ends_with(str, "helicopter") then x = x + 1 end end local pretty = require "inspect" print("stringx =", pretty({ time_taken = {measure.time_taken(fn, 10000)}, memory_taken = {measure.memory_taken(fn, 10000)} })) local function starts_with(str, prefix) return str:find(prefix, 1, true) == 1 end local function ends_with(str, ending) return ending == "" or str:sub(-#ending) == ending end local fn = function() local x = 0 if ends_with(str, "h") then x = x + 1 end if ends_with(str, "helll") then x = x + 1 end if ends_with(str, "helicopter") then x = x + 1 end end print("find =", pretty({ time_taken = {measure.time_taken(fn, 10000)}, memory_taken = {measure.memory_taken(fn, 10000)} })) starts_with =========== stringx = { memory_taken = { 0, 0, 0 }, time_taken = { 1.5098012518138e-007, 9.988434612751e-008, 2.1699932403862e-005 } } find = { memory_taken = { 0, 0, 0 }, time_taken = { 2.7349997544661e-007, 1.9988510757685e-007, 9.1999536380172e-006 } } ends_with ========= stringx = { memory_taken = { 0, 0, 0 }, time_taken = { 9.0479978825897e-008, 0, 2.5199959054589e-005 } } find = { memory_taken = { 0, 0, 0 }, time_taken = { 2.1833006758243e-007, 1.9988510757685e-007, 6.1000464484096e-006 } } 2022-03-02 18:30:52 +00:00			`--Using loops is actually faster than string.find!`
[modified] stringx.starts_with to not generate garbage 2021-03-02 00:47:28 +00:00			`function stringx.starts_with(s, prefix)`
			`for i = 1, #prefix do`
			`if s:byte(i) ~= prefix:byte(i) then`
			`return false`
			`end`
			`end`
			`return true`
[added] stringx.starts_with 2021-03-02 00:45:19 +00:00			`end`

[modified] PR #17 a little; simplified various unneeded sections and added comments 2021-04-14 07:06:31 +00:00			`--check if a given string ends with another`
			`--(without garbage)`
			`function stringx.ends_with(s, suffix)`
			`local len = #s`
			`local suffix_len = #suffix`
			`for i = 0, suffix_len - 1 do`
			`if s:byte(len - i) ~= suffix:byte(suffix_len - i) then`
Added ends_with method to stringx 2021-04-07 16:59:42 +00:00			`return false`
			`end`
			`end`
			`return true`
			`end`

added stringx.split_and_trim to save me inlining something with the same effect all over the place 2022-10-24 05:08:39 +00:00			`--split elements by delimiter and trim the results, discarding empties`
			`--useful for hand-entered "permissive" data`
			`-- "a,b, c, " -> {"a", "b", "c"}`
			`function stringx.split_and_trim(s, delim)`
			`s = stringx.split(s, delim)`
			`for i = #s, 1, -1 do`
			`local v = stringx.trim(s[i])`
			`if v == "" then`
			`table.remove(s, i)`
			`else`
			`s[i] = v`
			`end`
			`end`
			`return s`
			`end`

[added] stringx module (just split for now) and overlay to string on export 2020-04-17 00:35:00 +00:00			`return stringx`