--batteries/stringx.lua

--[[
extra string routines
]]
--work out the require prefix shared with the sibling modules by stripping
--"stringx" from the first argument this chunk was loaded with
--(gsub also returns a replacement count; only its first result is kept here)
local path = (...):gsub("stringx", "")
--note: this local shadows the global assert for the rest of the file
local assert = require(path .. "assert")
local pretty = require(path .. "pretty")
--the module table
--keys that are not defined on it are looked up in the standard string library
local stringx = setmetatable({}, {
__index = string
})
--split a string on a delimiter into an ordered table
--
--	self  - the string to split
--	delim - the delimiter, matched byte-for-byte (not as a pattern)
--	        defaults to "", which splits into single-byte strings
--	limit - the maximum number of splits to perform; defaults to no limit
--	        whatever follows the last split is returned unsplit as the
--	        final element. not consulted when delim is empty
--
--returns a new sequence of the pieces found between the delimiters
--delimiters are matched left to right and never overlap
function stringx.split(self, delim, limit)
	delim = delim or ""
	limit = (limit ~= nil and limit) or math.huge

	assert:type(self, "string", "stringx.split - self", 1)
	assert:type(delim, "string", "stringx.split - delim", 1)
	assert:type(limit, "number", "stringx.split - limit", 1)
	assert(limit >= 0, "max_split must be positive!")

	--we try to create as little garbage as possible!
	--only one table to contain the result, plus the split strings.
	--so we do two passes, and work with the bytes underlying the string
	--partly because string.find is not compiled on older luajit :)
	local res = {}
	local length = self:len()
	local delim_length = delim:len()

	--empty delim? split to individual characters
	if delim_length == 0 then
		for i = 1, length do
			res[i] = self:sub(i, i)
		end
		return res
	end

	local delim_start = delim:byte(1)

	--pass 1
	--collect split sites (the start index of each delimiter found)
	local i = 1
	while i <= length do
		--cheap check on the first byte, then confirm the rest
		local has_whole_delim = self:byte(i) == delim_start
		if has_whole_delim then
			for j = 2, delim_length do
				--reading past the end yields nil, which never matches
				if self:byte(i + j - 1) ~= delim:byte(j) then
					has_whole_delim = false
					break
				end
			end
		end

		if has_whole_delim then
			--out of splits; the rest stays in the final section
			if #res >= limit then
				break
			end
			res[#res + 1] = i
			--iterate forward the whole delimiter
			i = i + delim_length
		else
			--only ever step a single byte after a failed match:
			--a real delimiter may begin inside the bytes we just compared
			--(eg "ab" in "aab"), so skipping further would miss it
			i = i + 1
		end
	end

	--pass 2
	--swap each split site for the substring that ends just before it
	i = 1
	for si, j in ipairs(res) do
		res[si] = self:sub(i, j - 1)
		i = j + delim_length
	end
	--add the final section
	res[#res + 1] = self:sub(i, -1)

	--return the collection
	return res
end
--expose pretty.string under the stringx table as well
stringx.pretty = pretty.string

--(generate a map of whitespace byte values)
--keyed by the byte value of space, tab, newline and carriage return
local _whitespace_bytes = {}
for _, byte_value in ipairs({(" \t\n\r"):byte(1, -1)}) do
	_whitespace_bytes[byte_value] = true
end
--trim all whitespace off the head and tail of a string
--	specifically trims space, tab, newline, and carriage return characters
--	ignores form feeds, vertical tabs, and backspaces
--
--returns s itself when there is nothing to trim, so a new string is only
--created when something is actually cut off
function stringx.trim(s)
	local length = s:len()

	--walk forward to the first byte that is not whitespace
	local first = 1
	while first <= length and _whitespace_bytes[s:byte(first)] do
		first = first + 1
	end

	--ran off the end: the string is empty or whitespace only
	if first > length then
		return ""
	end

	--walk backward to the last byte that is not whitespace
	--(this stops at `first` at the latest, as that byte is known to be content)
	local last = length
	while _whitespace_bytes[s:byte(last)] do
		last = last - 1
	end

	--content already spans the whole string
	if first == 1 and last == length then
		return s
	end

	--pull out the content
	return s:sub(first, last)
end
--trim the start of a string
--	removes leading space, tab, newline, and carriage return characters
--
--returns s itself when there is nothing to trim,
--and "" when s consists solely of whitespace
function stringx.ltrim(s)
	local len = #s
	--start one past the end, so a string with no content trims to ""
	--(starting at 1 would hand an all-whitespace string back untouched)
	local head = len + 1
	for i = 1, len do
		if not _whitespace_bytes[s:byte(i)] then
			head = i
			break
		end
	end
	--content starts at the first byte (or s is empty): nothing to cut
	if head == 1 then
		return s
	end
	return s:sub(head)
end
--trim the end of a string
--	removes trailing space, tab, newline, and carriage return characters
--
--returns s itself when there is nothing to trim,
--and "" when s consists solely of whitespace
function stringx.rtrim(s)
	local len = #s
	--start before the first byte, so a string with no content trims to ""
	--(starting at len would hand an all-whitespace string back untouched)
	local tail = 0
	for i = len, 1, -1 do
		if not _whitespace_bytes[s:byte(i)] then
			tail = i
			break
		end
	end
	--content runs to the last byte (or s is empty): nothing to cut
	if tail == len then
		return s
	end
	return s:sub(1, tail)
end
--strip the indentation of the first non-empty line from every line of a string
--
--	s                   - the (possibly multi-line) string to de-indent
--	keep_trailing_empty - truthy to keep empty lines at the end of the result
--
--leading empty lines are always dropped. the indent is the run of spaces and
--tabs at the start of the first remaining line; lines that do not begin with
--exactly that indent are left alone, except lines that are just a shorter
--piece of the indent, which become empty
--
--note: when the first line has no indent at all the remaining lines are
--re-joined as they are, and keep_trailing_empty is not consulted
function stringx.deindent(s, keep_trailing_empty)
	--use windows newlines if the string contains any, unix ones otherwise
	local newline = "\n"
	if s:find("\r\n", 1, true) then
		newline = "\r\n"
	end

	local lines = stringx.split(s, newline)

	--drop blank lines from the front
	while lines[1] == "" do
		table.remove(lines, 1)
	end

	--nothing left at all
	if #lines == 0 then
		return ""
	end

	--the leading spaces/tabs of the first line define the indent
	--(the pattern can match nothing, so this is always a string)
	local indent = lines[1]:match("^[ \t]*")
	local indent_len = #indent

	--first line not indented; hand the lines back as they are
	if indent_len == 0 then
		return table.concat(lines, newline)
	end

	--strip the indent in place, line by line
	for index = 1, #lines do
		local line = lines[index]
		if line ~= "" then
			local line_start = line:sub(1, indent_len)
			local start_len = #line_start
			local fully_indented = line_start == indent
			--a line shorter than the indent that is itself a piece of it
			local partial_indent_only = start_len < indent_len
				and line_start == indent:sub(1, start_len)
			if fully_indented or partial_indent_only then
				lines[index] = line:sub(start_len + 1)
			end
		end
	end

	--drop empty lines from the back unless asked to keep them
	if not keep_trailing_empty then
		while lines[#lines] == "" do
			table.remove(lines)
		end
	end

	return table.concat(lines, newline)
end

--alias
stringx.dedent = stringx.deindent
--apply a template to a string
--every $name placeholder (letters, digits and underscores) is handed to
--string.gsub along with sub, which may be a table or a function
--	ie stringx.apply_template("hello $name", {name = "tom"}) == "hello tom"
--returns only the substituted string, not gsub's replacement count
function stringx.apply_template(s, sub)
	--the parentheses cut gsub's results down to the string alone
	return (s:gsub("%$([%w_]+)", sub))
end
--check if a given string contains another
--(without garbage)
--compares raw bytes, so needle is never treated as a pattern
--an empty needle is contained in every haystack
function stringx.contains(haystack, needle)
	local needle_len = #needle
	--the last position at which the needle could still fit
	local last_start = #haystack - needle_len + 1

	local start = 1
	while start <= last_start do
		--count how many needle bytes match from this position
		local matched = 0
		while
			matched < needle_len
			and haystack:byte(start + matched) == needle:byte(matched + 1)
		do
			matched = matched + 1
		end
		if matched == needle_len then
			return true
		end
		start = start + 1
	end

	return false
end
--check if a given string starts with another
--(without garbage)
--[==[
note carried over from the change that added these comments
("Add comments for things that surprised me"):

As a new user, there were things I was skeptical about and after digging in,
these were my conclusions.

Compared to the simple and obvious lua wiki solutions, batteries' string
functions are slightly faster. GC is the same.

Test

	local str = "hello world"
	local fn = function()
		local x = 0
		if stringx.ends_with(str, "h") then
			x = x + 1
		end
		if stringx.ends_with(str, "helll") then
			x = x + 1
		end
		if stringx.ends_with(str, "helicopter") then
			x = x + 1
		end
	end
	local pretty = require "inspect"
	print("stringx =", pretty({
		time_taken = {measure.time_taken(fn, 10000)},
		memory_taken = {measure.memory_taken(fn, 10000)}
	}))

	local function starts_with(str, prefix)
		return str:find(prefix, 1, true) == 1
	end
	local function ends_with(str, ending)
		return ending == "" or str:sub(-#ending) == ending
	end
	local fn = function()
		local x = 0
		if ends_with(str, "h") then
			x = x + 1
		end
		if ends_with(str, "helll") then
			x = x + 1
		end
		if ends_with(str, "helicopter") then
			x = x + 1
		end
	end
	print("find =", pretty({
		time_taken = {measure.time_taken(fn, 10000)},
		memory_taken = {measure.memory_taken(fn, 10000)}
	}))

starts_with
===========

	stringx = {
		memory_taken = { 0, 0, 0 },
		time_taken = { 1.5098012518138e-007, 9.988434612751e-008, 2.1699932403862e-005 }
	}
	find = {
		memory_taken = { 0, 0, 0 },
		time_taken = { 2.7349997544661e-007, 1.9988510757685e-007, 9.1999536380172e-006 }
	}

ends_with
=========

	stringx = {
		memory_taken = { 0, 0, 0 },
		time_taken = { 9.0479978825897e-008, 0, 2.5199959054589e-005 }
	}
	find = {
		memory_taken = { 0, 0, 0 },
		time_taken = { 2.1833006758243e-007, 1.9988510757685e-007, 6.1000464484096e-006 }
	}
]==]
--Using loops is actually faster than string.find!
--compares s and prefix byte by byte from the front
--returns true when every byte of prefix matches (always true for an empty prefix)
function stringx.starts_with(s, prefix)
	local prefix_len = #prefix
	local index = 1
	while index <= prefix_len do
		--past the end of s, byte() gives nil, which never equals a prefix byte
		if s:byte(index) ~= prefix:byte(index) then
			return false
		end
		index = index + 1
	end
	return true
end
--check if a given string ends with another
--(without garbage)
--compares s and suffix byte by byte from the back
--returns true when every byte of suffix matches (always true for an empty suffix)
function stringx.ends_with(s, suffix)
	local s_index = #s
	local suffix_index = #suffix
	--walk both strings backward in lockstep until the suffix is used up
	while suffix_index >= 1 do
		if s:byte(s_index) ~= suffix:byte(suffix_index) then
			return false
		end
		s_index = s_index - 1
		suffix_index = suffix_index - 1
	end
	return true
end
--split elements by delimiter and trim the results, discarding empties
--useful for hand-entered "permissive" data
--	"a,b, c, " -> {"a", "b", "c"}
--returns a sequence of the non-empty trimmed pieces, in their original order
function stringx.split_and_trim(s, delim)
	local kept = {}
	for _, piece in ipairs(stringx.split(s, delim)) do
		local trimmed = stringx.trim(piece)
		--pieces that were empty or only whitespace are left out
		if trimmed ~= "" then
			kept[#kept + 1] = trimmed
		end
	end
	return kept
end
--titlizes a string
--"quick brown fox" becomes "Quick Brown Fox"
--upper-cases a lowercase letter at the very start of the string, and any
--lowercase letter directly after a whitespace character; nothing else changes
function stringx.title_case(s)
	--each gsub result is bound to a single local, dropping the match count
	local words_capitalised = s:gsub("%s%l", string.upper)
	local titled = words_capitalised:gsub("^%l", string.upper)
	return titled
end
return stringx