[flang] Implement SPLIT intrinsic subroutine with tests (#185584)

This is the implementation of part of F2023 new feature US 03.
Extracting tokens from a string, SPLIT intrinsic.

It's section 16.9.196 SPLIT (STRING, SET, POS [, BACK]) of Fortran 2023
Standard.

It's part of Flang issue
[#178044](https://github.com/llvm/llvm-project/issues/178044). Note that
I work with @kwyatt-ext on this issue. He implemented the other part,
TOKENIZE.

A test will be added into
[llvm-test-suite](https://github.com/llvm/llvm-test-suite) later after
this PR is merged.
This commit is contained in:
laoshd 2026-03-20 11:12:51 -06:00 committed by GitHub
parent 68a9e9ca3e
commit 5a14e4f231
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 404 additions and 40 deletions

View File

@ -992,6 +992,42 @@ static RT_API_ATTRS void TokenizePositionsImpl(Descriptor &first,
}
}
// SPLIT — implemented in terms of SCAN.
// When BACK is false, returns the position of the leftmost character in SET
// at a position > POS, or LEN(STRING)+1 if none.
// When BACK is true, returns the position of the rightmost character in SET
// at a position < POS, or 0 if none.
template <typename CHAR>
static RT_API_ATTRS std::size_t SplitImpl(const CHAR *string,
std::size_t stringLen, const CHAR *set, std::size_t setLen, std::size_t pos,
bool back) {
if (back) {
std::size_t scanLen{pos > 1 ? pos - 1 : std::size_t{0}};
if (scanLen > stringLen) {
scanLen = stringLen;
}
if constexpr (sizeof(CHAR) == 1) {
return ScanVerify<false>(string, scanLen, set, setLen, true);
} else {
return ScanVerify<CHAR, CharFunc::Scan>(
string, scanLen, set, setLen, true);
}
} else {
if (pos >= stringLen) {
return stringLen + 1;
}
std::size_t npos;
if constexpr (sizeof(CHAR) == 1) {
npos =
ScanVerify<false>(string + pos, stringLen - pos, set, setLen, false);
} else {
npos = ScanVerify<CHAR, CharFunc::Scan>(
string + pos, stringLen - pos, set, setLen, false);
}
return npos != 0 ? pos + npos : stringLen + 1;
}
}
extern "C" {
RT_EXT_API_GROUP_BEGIN
@ -1375,6 +1411,21 @@ void RTDEF(TokenizePositions)(Descriptor &first, Descriptor &last,
TokenizePositionsImpl(first, last, string, set, terminator);
}
std::size_t RTDEF(Split1)(const char *string, std::size_t stringLen,
const char *set, std::size_t setLen, std::size_t pos, bool back) {
return SplitImpl(string, stringLen, set, setLen, pos, back);
}
std::size_t RTDEF(Split2)(const char16_t *string, std::size_t stringLen,
const char16_t *set, std::size_t setLen, std::size_t pos, bool back) {
return SplitImpl(string, stringLen, set, setLen, pos, back);
}
std::size_t RTDEF(Split4)(const char32_t *string, std::size_t stringLen,
const char32_t *set, std::size_t setLen, std::size_t pos, bool back) {
return SplitImpl(string, stringLen, set, setLen, pos, back);
}
RT_EXT_API_GROUP_END
}
} // namespace Fortran::runtime

View File

@ -392,6 +392,91 @@ TYPED_TEST(SearchTests, VerifyTests) {
"VERIFY", tests, std::get<SearchFunction<TypeParam>>(functions));
}
// Test SPLIT()
template <typename CHAR>
using SplitFunction = std::function<std::size_t(
const CHAR *, std::size_t, const CHAR *, std::size_t, std::size_t, bool)>;
using SplitFunctions = std::tuple<SplitFunction<char>, SplitFunction<char16_t>,
SplitFunction<char32_t>>;
struct SplitTestCase {
const char *string, *set;
std::size_t pos;
bool back;
std::size_t expect;
};
template <typename CHAR>
void RunSplitTests(const char *which,
const std::vector<SplitTestCase> &testCases,
const SplitFunction<CHAR> &function) {
for (const auto &t : testCases) {
std::size_t strLen{std::strlen(t.string)}, setLen{std::strlen(t.set)};
std::basic_string<CHAR> str{t.string, t.string + strLen};
std::basic_string<CHAR> set{t.set, t.set + setLen};
auto got{function(str.data(), strLen, set.data(), setLen, t.pos, t.back)};
ASSERT_EQ(got, t.expect)
<< which << "('" << t.string << "','" << t.set << "',pos=" << t.pos
<< ",back=" << t.back << ") for CHARACTER(kind=" << sizeof(CHAR)
<< "): got " << got << ", expected " << t.expect;
}
}
template <typename CHAR> struct SplitTests : public ::testing::Test {};
TYPED_TEST_SUITE(SplitTests, CharacterTypes, );
TYPED_TEST(SplitTests, SplitForward) {
static SplitFunctions functions{
RTNAME(Split1), RTNAME(Split2), RTNAME(Split4)};
static std::vector<SplitTestCase> tests{
// "one,two,three" with set=","
// Forward scanning: from pos=0, find first ',' at position 4
{"one,two,three", ",", 0, false, 4},
// From pos=4, find next ',' at position 8
{"one,two,three", ",", 4, false, 8},
// From pos=8, no more ',', return len+1=14
{"one,two,three", ",", 8, false, 14},
// Empty string
{"", ",", 0, false, 1},
// No delimiters in string
{"abc", ",", 0, false, 4},
// String is all delimiters
{",,", ",", 0, false, 1},
{",,", ",", 1, false, 2},
{",,", ",", 2, false, 3},
// pos at end of string
{"abc", ",", 3, false, 4},
// Multiple delimiter characters in set
{"a,b;c", ",;", 0, false, 2},
{"a,b;c", ",;", 2, false, 4},
{"a,b;c", ",;", 4, false, 6},
};
RunSplitTests(
"SPLIT(forward)", tests, std::get<SplitFunction<TypeParam>>(functions));
}
TYPED_TEST(SplitTests, SplitBackward) {
static SplitFunctions functions{
RTNAME(Split1), RTNAME(Split2), RTNAME(Split4)};
static std::vector<SplitTestCase> tests{
// "one,two,three" with set=","
// Backward scanning: from pos=14 (len+1), find last ',' at position 8
{"one,two,three", ",", 14, true, 8},
// From pos=8, find previous ',' at position 4
{"one,two,three", ",", 8, true, 4},
// From pos=4, no ',' before position 4, return 0
{"one,two,three", ",", 4, true, 0},
// Empty string
{"", ",", 1, true, 0},
// pos=0 or pos=1 should return 0
{"abc", ",", 0, true, 0},
{"abc", ",", 1, true, 0},
// No delimiters in string
{"abc", ",", 4, true, 0},
};
RunSplitTests(
"SPLIT(backward)", tests, std::get<SplitFunction<TypeParam>>(functions));
}
// Test REPEAT()
template <typename CHAR> struct RepeatTests : public ::testing::Test {};
TYPED_TEST_SUITE(RepeatTests, CharacterTypes, );

View File

@ -284,47 +284,18 @@ arguments or results with conversion factors.
`SELECTED_LOGICAL_KIND` maps a bit size to a kind of `LOGICAL`
There are two new character utility intrinsic
functions whose implementations have very low priority: `SPLIT` and `TOKENIZE`.
`TOKENIZE` requires memory allocation to return its results,
and could and should have been implemented once in some Fortran utility
library for those who need a slow tokenization facility rather than
requiring implementations in each vendor's runtime support library with
all the extra cost and compatibility risk that entails.
There are two new character utility intrinsic subroutines,
`SPLIT` and `TOKENIZE`, both of which are now implemented.
`SPLIT` is worse -- not only could it, like `TOKENIZE`,
have been supplied by a Fortran utility library rather than being
added to the standard, it's redundant;
it provides nothing that cannot be already accomplished by
composing today's `SCAN` intrinsic function with substring indexing:
`SPLIT` scans for separator characters in a string.
When `BACK` is absent or false, it returns the position of the leftmost
character in `SET` whose position in `STRING` is greater than `POS`,
or `LEN(STRING)+1` if no such character exists.
When `BACK` is true, it returns the position of the rightmost character
in `SET` whose position in `STRING` is less than `POS`, or 0 if no
such character exists.
```
module m
interface split
module procedure :: split
end interface
!instantiate for all possible ck/ik/lk combinations
integer, parameter :: ck = kind(''), ik = kind(0), lk = kind(.true.)
contains
simple elemental subroutine split(string, set, pos, back)
character(*, kind=ck), intent(in) :: string, set
integer(kind=ik), intent(in out) :: pos
logical(kind=lk), intent(in), optional :: back
if (present(back)) then
if (back) then
pos = scan(string(:pos-1), set, .true.)
return
end if
end if
npos = scan(string(pos+1:), set)
pos = merge(pos + npos, len(string) + 1, npos /= 0)
end
end
```
(The code above isn't a proposed implementation for `SPLIT`, just a
demonstration of how programs could use `SCAN` to accomplish the same
results today.)
`TOKENIZE` extracts tokens from a string separated by characters in a set.
## Source limitations

View File

@ -39,7 +39,7 @@ status of all important Fortran 2023 features. The table entries are based on th
| The specifiers typeof and classof | N | |
| Conditional expressions and arguments | N | |
| More use of boz constants | P | All usages other than enum are supported |
| Intrinsics for extracting tokens from a string | N | |
| Intrinsics for extracting tokens from a string | Y | |
| Intrinsics for Trig functions that work in degrees | Y | |
| Intrinsics for Trig functions that work in half revolutions| Y | |
| Changes to system_clock | N | |

View File

@ -361,6 +361,24 @@ that is present in `SET`, or zero if none is.
`VERIFY` is essentially the opposite: it returns the index of the first (or last) character
in `STRING` that is *not* present in `SET`, or zero if all are.
### Character intrinsic subroutines (Fortran 2023)
```
CALL SPLIT(CHARACTER(k,n) STRING, CHARACTER(k,m) SET, INTEGER(any) POS, LOGICAL(any) BACK=.FALSE.)
CALL TOKENIZE(CHARACTER(k,n) STRING, CHARACTER(k,m) SET, CHARACTER(k,:) TOKENS(:) [, SEPARATOR])
CALL TOKENIZE(CHARACTER(k,n) STRING, CHARACTER(k,m) SET, INTEGER FIRST(:), INTEGER LAST(:))
```
`SPLIT` scans for separator characters in `STRING` from the set `SET`.
When `BACK` is absent or `.FALSE.`, it returns (in `POS`) the position of the
leftmost character in `SET` whose position in `STRING` is greater than `POS`,
or `LEN(STRING)+1` if no such character exists.
When `BACK` is `.TRUE.`, it returns the position of the rightmost character in
`SET` whose position in `STRING` is less than `POS`, or 0 if no such character exists.
`TOKENIZE` extracts tokens from `STRING` delimited by characters in `SET`.
In Form 1, it returns the tokens as an array of characters and optionally the separator characters.
In Form 2, it returns the starting and ending positions of each token.
## Transformational intrinsic functions
This category comprises a large collection of intrinsic functions that

View File

@ -390,6 +390,7 @@ struct IntrinsicLibrary {
fir::ExtendedValue genSizeOf(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genSpacing(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args);
void genSplit(llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genSpread(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
fir::ExtendedValue genStorageSize(mlir::Type,
llvm::ArrayRef<fir::ExtendedValue>);

View File

@ -142,6 +142,15 @@ mlir::Value genVerify(fir::FirOpBuilder &builder, mlir::Location loc, int kind,
mlir::Value setBase, mlir::Value setLen,
mlir::Value back);
/// Generate call to the SPLIT runtime routine that is specialized on
/// \param kind.
/// The \param kind represents the kind of the elements in the strings.
/// Updates \p pos to the next separator position.
mlir::Value genSplit(fir::FirOpBuilder &builder, mlir::Location loc, int kind,
mlir::Value stringBase, mlir::Value stringLen,
mlir::Value setBase, mlir::Value setLen, mlir::Value pos,
mlir::Value back);
/// Generate call to TOKENIZE runtime (Form 1).
/// Splits \p stringBox into tokens based on separator characters in \p setBox.
/// \p tokensBox must be an unallocated allocatable array that receives the

View File

@ -137,6 +137,15 @@ void RTDECL(Tokenize)(Descriptor &tokens, Descriptor *separator,
void RTDECL(TokenizePositions)(Descriptor &first, Descriptor &last,
const Descriptor &string, const Descriptor &set,
const char *sourceFile = nullptr, int sourceLine = 0);
std::size_t RTDECL(Split1)(const char *string, std::size_t stringLen,
const char *set, std::size_t setLen, std::size_t pos, bool back = false);
std::size_t RTDECL(Split2)(const char16_t *string, std::size_t stringLen,
const char16_t *set, std::size_t setLen, std::size_t pos,
bool back = false);
std::size_t RTDECL(Split4)(const char32_t *string, std::size_t stringLen,
const char32_t *set, std::size_t setLen, std::size_t pos,
bool back = false);
}
} // namespace Fortran::runtime
#endif // FORTRAN_RUNTIME_CHARACTER_H_

View File

@ -1750,6 +1750,16 @@ static const IntrinsicInterface intrinsicSubroutine[]{
{{"seconds", AnyInt, Rank::scalar, Optionality::required,
common::Intent::In}},
{}, Rank::elemental, IntrinsicClass::impureSubroutine},
{"split",
{{"string", SameCharNoLen, Rank::scalar, Optionality::required,
common::Intent::In},
{"set", SameCharNoLen, Rank::scalar, Optionality::required,
common::Intent::In},
{"pos", AnyInt, Rank::scalar, Optionality::required,
common::Intent::InOut},
{"back", AnyLogical, Rank::scalar, Optionality::optional,
common::Intent::In}},
{}, Rank::elemental, IntrinsicClass::pureSubroutine},
{"tokenize",
{{"string", SameCharNoLen, Rank::scalar, Optionality::required,
common::Intent::In},

View File

@ -764,6 +764,13 @@ static constexpr IntrinsicHandler handlers[]{
/*isElemental=*/false},
{"sleep", &I::genSleep, {{{"seconds", asValue}}}, /*isElemental=*/false},
{"spacing", &I::genSpacing},
{"split",
&I::genSplit,
{{{"string", asAddr},
{"set", asAddr},
{"pos", asAddr},
{"back", asValue, handleDynamicOptional}}},
/*isElemental=*/false},
{"spread",
&I::genSpread,
{{{"source", asBox}, {"dim", asValue}, {"ncopies", asValue}}},
@ -8538,6 +8545,39 @@ void IntrinsicLibrary::genSleep(llvm::ArrayRef<fir::ExtendedValue> args) {
fir::runtime::genSleep(builder, loc, fir::getBase(args[0]));
}
// SPLIT
void IntrinsicLibrary::genSplit(llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4);
mlir::Value stringBase = fir::getBase(args[0]);
mlir::Value stringLen = fir::getLen(args[0]);
mlir::Value setBase = fir::getBase(args[1]);
mlir::Value setLen = fir::getLen(args[1]);
mlir::Value posAddr = fir::getBase(args[2]);
fir::KindTy kind =
fir::factory::CharacterExprHelper{builder, loc}.getCharacterKind(
stringBase.getType());
// BACK is optional and defaults to .FALSE. when absent.
mlir::Value back =
isStaticallyAbsent(args[3])
? builder.createIntegerConstant(loc, builder.getI1Type(), 0)
: fir::getBase(args[3]);
mlir::Type posRefTy = fir::dyn_cast_ptrEleTy(posAddr.getType());
mlir::Value posValue = fir::LoadOp::create(builder, loc, posRefTy, posAddr);
mlir::Type indexTy = builder.getIndexType();
mlir::Value posIndex = builder.createConvert(loc, indexTy, posValue);
mlir::Value newPos =
fir::runtime::genSplit(builder, loc, kind, stringBase, stringLen, setBase,
setLen, posIndex, back);
mlir::Value newPosConverted = builder.createConvert(loc, posRefTy, newPos);
fir::StoreOp::create(builder, loc, newPosConverted, posAddr);
}
// TOKENIZE
void IntrinsicLibrary::genTokenize(llvm::ArrayRef<fir::ExtendedValue> args) {
assert(args.size() == 4 && "TOKENIZE requires 3 or 4 arguments");

View File

@ -339,3 +339,30 @@ mlir::Value fir::runtime::genVerify(fir::FirOpBuilder &builder,
stringLen, setBase, setLen, back);
return fir::CallOp::create(builder, loc, func, args).getResult(0);
}
mlir::Value fir::runtime::genSplit(fir::FirOpBuilder &builder,
mlir::Location loc, int kind,
mlir::Value stringBase,
mlir::Value stringLen, mlir::Value setBase,
mlir::Value setLen, mlir::Value pos,
mlir::Value back) {
mlir::func::FuncOp func;
switch (kind) {
case 1:
func = fir::runtime::getRuntimeFunc<mkRTKey(Split1)>(loc, builder);
break;
case 2:
func = fir::runtime::getRuntimeFunc<mkRTKey(Split2)>(loc, builder);
break;
case 4:
func = fir::runtime::getRuntimeFunc<mkRTKey(Split4)>(loc, builder);
break;
default:
fir::emitFatalError(
loc, "unsupported CHARACTER kind value. Runtime expects 1, 2, or 4.");
}
auto fTy = func.getFunctionType();
auto args = fir::runtime::createArguments(
builder, loc, fTy, stringBase, stringLen, setBase, setLen, pos, back);
return fir::CallOp::create(builder, loc, func, args).getResult(0);
}

View File

@ -0,0 +1,45 @@
! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s
! CHECK-LABEL: split_basic
subroutine split_basic()
implicit none
character(20) :: string
character(5) :: set
integer :: pos
string = "one,two,three"
set = ","
pos = 0
call split(string, set, pos)
! CHECK: %[[BACK:.*]] = arith.constant false
! CHECK: %[[POS:.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: %[[POS_IDX:.*]] = fir.convert %[[POS]] : (i32) -> index
! CHECK: %[[STRING:.*]] = fir.convert %{{.*}} : (!fir.ref<!fir.char<1,20>>) -> !fir.ref<i8>
! CHECK: %[[SET:.*]] = fir.convert %{{.*}} : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
! CHECK: %[[POS_I64:.*]] = fir.convert %[[POS_IDX]] : (index) -> i64
! CHECK: %[[RESULT:.*]] = fir.call @_FortranASplit1(%[[STRING]], %{{.*}}, %[[SET]], %{{.*}}, %[[POS_I64]], %[[BACK]]) {{.*}} : (!fir.ref<i8>, i64, !fir.ref<i8>, i64, i64, i1) -> i64
! CHECK: %[[RESULT_I32:.*]] = fir.convert %[[RESULT]] : (i64) -> i32
! CHECK: fir.store %[[RESULT_I32]] to %{{.*}} : !fir.ref<i32>
end subroutine split_basic
! CHECK-LABEL: split_back
subroutine split_back()
implicit none
character(20) :: string
character(5) :: set
integer :: pos
logical :: back
string = "one,two,three"
set = ","
pos = 14
back = .true.
call split(string, set, pos, back)
! CHECK: %[[POS:.*]] = fir.load %{{.*}} : !fir.ref<i32>
! CHECK: %[[POS_IDX:.*]] = fir.convert %[[POS]] : (i32) -> index
! CHECK: %[[STRING:.*]] = fir.convert %{{.*}} : (!fir.ref<!fir.char<1,20>>) -> !fir.ref<i8>
! CHECK: %[[SET:.*]] = fir.convert %{{.*}} : (!fir.ref<!fir.char<1,5>>) -> !fir.ref<i8>
! CHECK: %[[POS_I64:.*]] = fir.convert %[[POS_IDX]] : (index) -> i64
! CHECK: %[[BACK_CVT:.*]] = fir.convert %{{.*}} : (!fir.logical<4>) -> i1
! CHECK: %[[RESULT:.*]] = fir.call @_FortranASplit1(%[[STRING]], %{{.*}}, %[[SET]], %{{.*}}, %[[POS_I64]], %[[BACK_CVT]]) {{.*}} : (!fir.ref<i8>, i64, !fir.ref<i8>, i64, i64, i1) -> i64
! CHECK: %[[RESULT_I32:.*]] = fir.convert %[[RESULT]] : (i64) -> i32
! CHECK: fir.store %[[RESULT_I32]] to %{{.*}} : !fir.ref<i32>
end subroutine split_back

View File

@ -0,0 +1,98 @@
! RUN: %python %S/test_errors.py %s %flang_fc1
! Check for semantic errors in split() subroutine calls
! Based on Fortran 2023 standard requirements
program test_split_errors
implicit none
character(20) :: string
character(5) :: set
integer :: pos
logical :: back
! Valid declarations for testing
integer :: int_scalar
real :: real_scalar
character(10) :: string_array(5)
character(5) :: set_array(5)
character(len=20, kind=2) :: string_k2
character(len=5, kind=2) :: set_k2
character(len=20, kind=4) :: string_k4
character(len=5, kind=4) :: set_k4
!========================================================================
! Valid calls (reference)
!========================================================================
call split(string, set, pos)
call split(string, set, pos, back)
call split("hello world", " ", pos)
call split("hello world", " ", pos, .false.)
! Valid calls with different character kinds
call split(string_k2, set_k2, pos)
call split(string_k2, set_k2, pos, back)
call split(string_k4, set_k4, pos)
call split(string_k4, set_k4, pos, back)
!========================================================================
! Wrong types for STRING argument
!========================================================================
!ERROR: Actual argument for 'string=' has bad type 'INTEGER(4)'
call split(int_scalar, set, pos)
!ERROR: Actual argument for 'string=' has bad type 'REAL(4)'
call split(real_scalar, set, pos)
!========================================================================
! Wrong rank for STRING (must be scalar)
!========================================================================
!ERROR: 'string=' argument has unacceptable rank 1
call split(string_array, set, pos)
!========================================================================
! Wrong types for SET argument
!========================================================================
!ERROR: Actual argument for 'set=' has bad type 'INTEGER(4)'
call split(string, int_scalar, pos)
!ERROR: Actual argument for 'set=' has bad type 'REAL(4)'
call split(string, real_scalar, pos)
!========================================================================
! Wrong types for POS argument
!========================================================================
!ERROR: Actual argument for 'pos=' has bad type 'REAL(4)'
call split(string, set, real_scalar)
!========================================================================
! Wrong types for BACK argument
!========================================================================
!ERROR: Actual argument for 'back=' has bad type 'INTEGER(4)'
call split(string, set, pos, int_scalar)
!========================================================================
! Character kind mismatches between STRING and SET
!========================================================================
!ERROR: Actual argument for 'set=' has bad type or kind 'CHARACTER(KIND=1,LEN=5_8)'
call split(string_k2, set, pos)
!ERROR: Actual argument for 'set=' has bad type or kind 'CHARACTER(KIND=2,LEN=5_8)'
call split(string, set_k2, pos)
!ERROR: Actual argument for 'set=' has bad type or kind 'CHARACTER(KIND=1,LEN=5_8)'
call split(string_k4, set, pos)
!ERROR: Actual argument for 'set=' has bad type or kind 'CHARACTER(KIND=4,LEN=5_8)'
call split(string, set_k4, pos)
!ERROR: Actual argument for 'set=' has bad type or kind 'CHARACTER(KIND=4,LEN=5_8)'
call split(string_k2, set_k4, pos)
end program test_split_errors