From ca0e7d31d05bea7b6d29a420e9b1c756a1845e95 Mon Sep 17 00:00:00 2001 From: kwyatt-ext Date: Fri, 27 Feb 2026 12:43:18 -0600 Subject: [PATCH] [flang] [flang-rt] Addition of the Fortran 2023 TOKENIZE intrinsic. (#181030) This implements the TOKENIZE intrinsic per the Fortran 2023 Standard. TOKENIZE is a more complicated addition to the flang intrinsics, as it is the first subroutine that has multiple unique footprints. Intrinsic functions have already addressed this challenge, however subroutines and functions are processed slightly differently and the function code was not a good 1:1 solution for the subroutines. To solve this the function code was used as an example to create error buffering within the intrinsics Process and select the most appropriate error message for a given subroutine footprint. A simple FIR compile test was added to show the proper compilation of each case. A thorough negative path test has also been added, ensuring that all possible errors are reported as expected. Testing prior to commit: = check-flang ========================================== ``` Testing Time: 139.51s Total Discovered Tests: 4153 Unsupported : 77 (1.85%) Passed : 4065 (97.88%) Expectedly Failed: 11 (0.26%) FLANG Container Test completed 2 minutes (160 s). Total Time: 2 minutes (160 s) Completed : Wed Feb 11 04:05:50 PM CST 2026 ``` = check-flang-rt ========================================== ``` Testing Time: 1.55s Total Discovered Tests: 258 Passed: 258 (100.00%) FLANG Container Test completed 0 minutes (55 s). Total Time: 0 minutes (56 s) Completed : Wed Feb 11 04:08:32 PM CST 2026 ``` = llvm-test-suite ========================================== ``` Testing Time: 1886.64s Total Discovered Tests: 6926 Passed: 6926 (100.00%) CCE SLES Container debug compile completed 31 minutes (1895 s). CCE SLES Container debug install completed in 0 minutes (0 s). Total Time: 31 minutes (1895 s) Completed : Wed Feb 11 05:46:52 PM CST 2026 ``` Additionally, (FYI) an executable test has been written and will be added to the llvm-test-suite under a separate PR. --------- Co-authored-by: Kevin Wyatt --- flang-rt/lib/runtime/character.cpp | 437 ++++++++++++++++++ flang-rt/unittests/Runtime/CharacterTest.cpp | 208 +++++++++ .../flang/Optimizer/Builder/IntrinsicCall.h | 1 + .../Optimizer/Builder/Runtime/Character.h | 16 + flang/include/flang/Runtime/character.h | 7 + flang/lib/Evaluate/intrinsics.cpp | 107 ++++- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 102 ++++ .../Optimizer/Builder/Runtime/Character.cpp | 29 ++ flang/test/Lower/Intrinsics/tokenize.f90 | 27 ++ flang/test/Semantics/tokenize-errors.f90 | 267 +++++++++++ 10 files changed, 1200 insertions(+), 1 deletion(-) create mode 100644 flang/test/Lower/Intrinsics/tokenize.f90 create mode 100644 flang/test/Semantics/tokenize-errors.f90 diff --git a/flang-rt/lib/runtime/character.cpp b/flang-rt/lib/runtime/character.cpp index 04c9055f8449..a663643fa18c 100644 --- a/flang-rt/lib/runtime/character.cpp +++ b/flang-rt/lib/runtime/character.cpp @@ -571,6 +571,427 @@ static RT_API_ATTRS void MaxMin(Descriptor &accumulator, const Descriptor &x, } } +template +static inline RT_API_ATTRS bool TokenizeIsInSet( + CHAR ch, const CHAR *set, std::size_t setChars) { + for (std::size_t j{0}; j < setChars; ++j) { + if (set[j] == ch) { + return true; + } + } + return false; +} + +// Pad the token with spaces. +template +static inline RT_API_ATTRS void TokenizeFillBlanks( + CHAR *to, std::size_t chars) { + if (chars == 0) { + return; + } + if constexpr (std::is_same_v) { + runtime::memset(to, ' ', chars); + } else { + for (std::size_t j{0}; j < chars; ++j) { + to[j] = static_cast(' '); + } + } +} + +struct TokenizeAnalysis { + std::size_t tokenCount{0}; + std::size_t maxTokenLen{0}; // in characters +}; + +template +static RT_API_ATTRS TokenizeAnalysis AnalyzeTokenize(const CHAR *str, + std::size_t strChars, const CHAR *set, std::size_t setChars) { + TokenizeAnalysis analysis; + + // Empty STRING should return one empty token, per Fortran standard. + if (strChars == 0) { + analysis.tokenCount = 1; + analysis.maxTokenLen = 0; + return analysis; + } + if (setChars == 0) { + analysis.tokenCount = 1; + analysis.maxTokenLen = strChars; + return analysis; + } + + // Split STRING at each delimiter character. This produces empty tokens + // when delimiters are consecutive or when STRING starts/ends with a + // delimiter. + std::size_t tokenStart{0}; + for (std::size_t pos{0}; pos < strChars; ++pos) { + if (TokenizeIsInSet(str[pos], set, setChars)) { + analysis.maxTokenLen = std::max(analysis.maxTokenLen, pos - tokenStart); + analysis.tokenCount++; + tokenStart = pos + 1; + } + } + analysis.maxTokenLen = std::max(analysis.maxTokenLen, strChars - tokenStart); + analysis.tokenCount++; + + return analysis; +} +// Allocates and populates the result arrays for TOKENIZE Form 1. +template +static RT_API_ATTRS void TokenizeFillForm1(Descriptor &tokens, + Descriptor *separator, const Descriptor &string, const CHAR *str, + std::size_t strChars, const CHAR *set, std::size_t setChars, + const TokenizeAnalysis &analysis, Terminator &terminator) { + + // (Re)allocate TOKENS. + if (tokens.IsAllocated()) { + tokens.Deallocate(); + } + SubscriptValue tokensExtent[1]{ + static_cast(analysis.tokenCount)}; + std::size_t tokenElemBytes{ + analysis.tokenCount == 0 ? 0 : analysis.maxTokenLen * sizeof(CHAR)}; + tokens.Establish(string.type(), tokenElemBytes, nullptr, 1, tokensExtent, + CFI_attribute_allocatable); + tokens.GetDimension(0).SetBounds(1, tokensExtent[0]); + if (tokens.Allocate(kNoAsyncObject) != CFI_SUCCESS) { + terminator.Crash("TOKENIZE: could not allocate TOKENS array"); + } + + // (Re)allocate SEPARATOR if present. + std::size_t sepCount{analysis.tokenCount > 0 ? analysis.tokenCount - 1 : 0}; + std::size_t sepElemBytes{sizeof(CHAR)}; + if (separator) { + if (separator->IsAllocated()) { + separator->Deallocate(); + } + SubscriptValue sepExtent[1]{static_cast(sepCount)}; + separator->Establish(string.type(), sepElemBytes, nullptr, 1, sepExtent, + CFI_attribute_allocatable); + separator->GetDimension(0).SetBounds(1, sepExtent[0]); + if (separator->Allocate(kNoAsyncObject) != CFI_SUCCESS) { + terminator.Crash("TOKENIZE: could not allocate SEPARATOR array"); + } + } + + if (analysis.tokenCount == 0) { + return; + } + + // Populate tokens and separators. + if (setChars == 0) { + // One token (possibly empty) equal to STRING. + if (tokenElemBytes > 0) { + CHAR *tokDest{tokens.OffsetElement(0)}; + if (strChars > 0) { + runtime::memcpy(tokDest, str, strChars * sizeof(CHAR)); + } + TokenizeFillBlanks(tokDest + strChars, analysis.maxTokenLen - strChars); + } + return; + } + + std::size_t tokenIndex{0}; + std::size_t sepIndex{0}; + + auto storeToken = [&](std::size_t tokenStart, std::size_t tokenEnd) { + std::size_t tokenLen{tokenEnd - tokenStart}; + if (tokenElemBytes > 0) { + // Each element is stored in a fixed-size slot of `tokenElemBytes`. + CHAR *tokDest{tokens.OffsetElement(tokenIndex * tokenElemBytes)}; + if (tokenLen > 0) { + runtime::memcpy(tokDest, str + tokenStart, tokenLen * sizeof(CHAR)); + } + TokenizeFillBlanks(tokDest + tokenLen, analysis.maxTokenLen - tokenLen); + } + ++tokenIndex; + }; + + // Split at each delimiter character, producing empty tokens at boundaries + // and between consecutive delimiters. + std::size_t tokenStart{0}; + for (std::size_t pos{0}; pos < strChars; ++pos) { + if (TokenizeIsInSet(str[pos], set, setChars)) { + storeToken(tokenStart, pos); + if (separator) { + CHAR *sepDest{separator->OffsetElement(sepIndex * sepElemBytes)}; + sepDest[0] = str[pos]; + ++sepIndex; + } + tokenStart = pos + 1; + } + } + storeToken(tokenStart, strChars); +} + +template +static RT_API_ATTRS void TokenizeStoreIntAt( + const Descriptor &result, std::size_t at, std::int64_t value) { + StoreIntegerAt{}(result, at, value); +} + +using TokenizeStoreIntFn = void (*)( + const Descriptor &, std::size_t, std::int64_t); + +static RT_API_ATTRS TokenizeStoreIntFn GetTokenizeStoreIntFn( + int kind, Terminator &terminator, const char *which) { + switch (kind) { + case 1: + return &TokenizeStoreIntAt<1>; + case 2: + return &TokenizeStoreIntAt<2>; + case 4: + return &TokenizeStoreIntAt<4>; + case 8: + return &TokenizeStoreIntAt<8>; + case 16: + return &TokenizeStoreIntAt<16>; + default: + terminator.Crash( + "TOKENIZE: unsupported INTEGER kind=%d for %s", kind, which); + } +} + +template +static RT_API_ATTRS void TokenizeFillPositions(Descriptor &first, + Descriptor &last, const CHAR *str, std::size_t strChars, const CHAR *set, + std::size_t setChars, TokenizeStoreIntFn storeFirst, + TokenizeStoreIntFn storeLast, Terminator &terminator) { + + // Empty STRING should return one empty token, per Fortran standard. + if (strChars == 0) { + storeFirst(first, 0, 1); + storeLast(last, 0, 0); + return; + } + if (setChars == 0) { + storeFirst(first, 0, 1); + storeLast(last, 0, static_cast(strChars)); + return; + } + + std::size_t tokenIndex{0}; + std::size_t tokenStart{0}; + for (std::size_t pos{0}; pos < strChars; ++pos) { + if (TokenizeIsInSet(str[pos], set, setChars)) { + storeFirst(first, tokenIndex, static_cast(tokenStart + 1)); + storeLast(last, tokenIndex, static_cast(pos)); + ++tokenIndex; + tokenStart = pos + 1; + } + } + storeFirst(first, tokenIndex, static_cast(tokenStart + 1)); + storeLast(last, tokenIndex, static_cast(strChars)); + ++tokenIndex; + + // Sanity check: we should have filled exactly the allocated extent. + if (tokenIndex != static_cast(first.GetDimension(0).Extent())) { + terminator.Crash("TOKENIZE: internal error populating FIRST/LAST"); + } +} + +// Tokenize Form 1 implementation. +static RT_API_ATTRS void TokenizeImpl(Descriptor &tokens, Descriptor *separator, + const Descriptor &string, const Descriptor &set, Terminator &terminator) { + RUNTIME_CHECK(terminator, string.rank() == 0); + RUNTIME_CHECK(terminator, set.rank() == 0); + RUNTIME_CHECK(terminator, string.raw().type == set.raw().type); + RUNTIME_CHECK(terminator, tokens.rank() == 1); + RUNTIME_CHECK(terminator, tokens.IsAllocatable()); + if (separator) { + RUNTIME_CHECK(terminator, separator->rank() == 1); + RUNTIME_CHECK(terminator, separator->IsAllocatable()); + } + + switch (string.raw().type) { + case CFI_type_char: { + std::size_t strBytes{string.ElementBytes()}; + std::size_t setBytes{set.ElementBytes()}; + std::size_t strChars{strBytes}; + std::size_t setChars{setBytes}; + const char *str{ + strBytes == 0 ? nullptr : string.OffsetElement()}; + const char *setPtr{ + setBytes == 0 ? nullptr : set.OffsetElement()}; + auto analysis{AnalyzeTokenize(str, strChars, setPtr, setChars)}; + TokenizeFillForm1(tokens, separator, string, str, strChars, setPtr, + setChars, analysis, terminator); + break; + } + case CFI_type_char16_t: { + std::size_t strBytes{string.ElementBytes()}; + std::size_t setBytes{set.ElementBytes()}; + std::size_t strChars{strBytes >> 1}; + std::size_t setChars{setBytes >> 1}; + const char16_t *str{ + strBytes == 0 ? nullptr : string.OffsetElement()}; + const char16_t *setPtr{ + setBytes == 0 ? nullptr : set.OffsetElement()}; + auto analysis{AnalyzeTokenize(str, strChars, setPtr, setChars)}; + TokenizeFillForm1(tokens, separator, string, str, strChars, setPtr, + setChars, analysis, terminator); + break; + } + case CFI_type_char32_t: { + std::size_t strBytes{string.ElementBytes()}; + std::size_t setBytes{set.ElementBytes()}; + std::size_t strChars{strBytes >> 2}; + std::size_t setChars{setBytes >> 2}; + const char32_t *str{ + strBytes == 0 ? nullptr : string.OffsetElement()}; + const char32_t *setPtr{ + setBytes == 0 ? nullptr : set.OffsetElement()}; + auto analysis{AnalyzeTokenize(str, strChars, setPtr, setChars)}; + TokenizeFillForm1(tokens, separator, string, str, strChars, setPtr, + setChars, analysis, terminator); + break; + } + default: + terminator.Crash("TOKENIZE: bad string type code %d", + static_cast(string.raw().type)); + } +} + +// Tokenize Form 2 implementation. +static RT_API_ATTRS void TokenizePositionsImpl(Descriptor &first, + Descriptor &last, const Descriptor &string, const Descriptor &set, + Terminator &terminator) { + RUNTIME_CHECK(terminator, string.rank() == 0); + RUNTIME_CHECK(terminator, set.rank() == 0); + RUNTIME_CHECK(terminator, string.raw().type == set.raw().type); + RUNTIME_CHECK(terminator, first.rank() == 1); + RUNTIME_CHECK(terminator, last.rank() == 1); + RUNTIME_CHECK(terminator, first.IsAllocatable()); + RUNTIME_CHECK(terminator, last.IsAllocatable()); + + auto firstCK{first.type().GetCategoryAndKind()}; + auto lastCK{last.type().GetCategoryAndKind()}; + if (!firstCK || firstCK->first != TypeCategory::Integer) { + terminator.Crash("TOKENIZE: FIRST is not an INTEGER array"); + } + if (!lastCK || lastCK->first != TypeCategory::Integer) { + terminator.Crash("TOKENIZE: LAST is not an INTEGER array"); + } + int firstKind{firstCK->second}; + int lastKind{lastCK->second}; + auto storeFirst{GetTokenizeStoreIntFn(firstKind, terminator, "FIRST")}; + auto storeLast{GetTokenizeStoreIntFn(lastKind, terminator, "LAST")}; + + // Count tokens. + std::size_t tokenCount{0}; + switch (string.raw().type) { + case CFI_type_char: { + std::size_t strBytes{string.ElementBytes()}; + std::size_t setBytes{set.ElementBytes()}; + std::size_t strChars{strBytes}; + std::size_t setChars{setBytes}; + const char *str{ + strBytes == 0 ? nullptr : string.OffsetElement()}; + const char *setPtr{ + setBytes == 0 ? nullptr : set.OffsetElement()}; + tokenCount = AnalyzeTokenize(str, strChars, setPtr, setChars).tokenCount; + break; + } + case CFI_type_char16_t: { + std::size_t strBytes{string.ElementBytes()}; + std::size_t setBytes{set.ElementBytes()}; + std::size_t strChars{strBytes >> 1}; + std::size_t setChars{setBytes >> 1}; + const char16_t *str{ + strBytes == 0 ? nullptr : string.OffsetElement()}; + const char16_t *setPtr{ + setBytes == 0 ? nullptr : set.OffsetElement()}; + tokenCount = AnalyzeTokenize(str, strChars, setPtr, setChars).tokenCount; + break; + } + case CFI_type_char32_t: { + std::size_t strBytes{string.ElementBytes()}; + std::size_t setBytes{set.ElementBytes()}; + std::size_t strChars{strBytes >> 2}; + std::size_t setChars{setBytes >> 2}; + const char32_t *str{ + strBytes == 0 ? nullptr : string.OffsetElement()}; + const char32_t *setPtr{ + setBytes == 0 ? nullptr : set.OffsetElement()}; + tokenCount = AnalyzeTokenize(str, strChars, setPtr, setChars).tokenCount; + break; + } + default: + terminator.Crash("TOKENIZE: bad string type code %d", + static_cast(string.raw().type)); + } + + // (Re)allocate FIRST/LAST. + if (first.IsAllocated()) { + first.Deallocate(); + } + if (last.IsAllocated()) { + last.Deallocate(); + } + SubscriptValue extent[1]{static_cast(tokenCount)}; + first.Establish(TypeCategory::Integer, firstKind, nullptr, 1, extent, + CFI_attribute_allocatable); + first.GetDimension(0).SetBounds(1, extent[0]); + last.Establish(TypeCategory::Integer, lastKind, nullptr, 1, extent, + CFI_attribute_allocatable); + last.GetDimension(0).SetBounds(1, extent[0]); + if (first.Allocate(kNoAsyncObject) != CFI_SUCCESS) { + terminator.Crash("TOKENIZE: could not allocate FIRST array"); + } + if (last.Allocate(kNoAsyncObject) != CFI_SUCCESS) { + terminator.Crash("TOKENIZE: could not allocate LAST array"); + } + + if (tokenCount == 0) { + return; + } + + // Populate FIRST/LAST. + switch (string.raw().type) { + case CFI_type_char: { + std::size_t strBytes{string.ElementBytes()}; + std::size_t setBytes{set.ElementBytes()}; + std::size_t strChars{strBytes}; + std::size_t setChars{setBytes}; + const char *str{ + strBytes == 0 ? nullptr : string.OffsetElement()}; + const char *setPtr{ + setBytes == 0 ? nullptr : set.OffsetElement()}; + TokenizeFillPositions(first, last, str, strChars, setPtr, setChars, + storeFirst, storeLast, terminator); + break; + } + case CFI_type_char16_t: { + std::size_t strBytes{string.ElementBytes()}; + std::size_t setBytes{set.ElementBytes()}; + std::size_t strChars{strBytes >> 1}; + std::size_t setChars{setBytes >> 1}; + const char16_t *str{ + strBytes == 0 ? nullptr : string.OffsetElement()}; + const char16_t *setPtr{ + setBytes == 0 ? nullptr : set.OffsetElement()}; + TokenizeFillPositions(first, last, str, strChars, setPtr, setChars, + storeFirst, storeLast, terminator); + break; + } + case CFI_type_char32_t: { + std::size_t strBytes{string.ElementBytes()}; + std::size_t setBytes{set.ElementBytes()}; + std::size_t strChars{strBytes >> 2}; + std::size_t setChars{setBytes >> 2}; + const char32_t *str{ + strBytes == 0 ? nullptr : string.OffsetElement()}; + const char32_t *setPtr{ + setBytes == 0 ? nullptr : set.OffsetElement()}; + TokenizeFillPositions(first, last, str, strChars, setPtr, setChars, + storeFirst, storeLast, terminator); + break; + } + default: + break; + } +} + extern "C" { RT_EXT_API_GROUP_BEGIN @@ -938,6 +1359,22 @@ void RTDEF(CharacterMin)(Descriptor &accumulator, const Descriptor &x, MaxMin(accumulator, x, sourceFile, sourceLine); } +// TOKENIZE Form 1 entry point +void RTDEF(Tokenize)(Descriptor &tokens, Descriptor *separator, + const Descriptor &string, const Descriptor &set, const char *sourceFile, + int sourceLine) { + Terminator terminator{sourceFile, sourceLine}; + TokenizeImpl(tokens, separator, string, set, terminator); +} + +// TOKENIZE Form 2 entry point +void RTDEF(TokenizePositions)(Descriptor &first, Descriptor &last, + const Descriptor &string, const Descriptor &set, const char *sourceFile, + int sourceLine) { + Terminator terminator{sourceFile, sourceLine}; + TokenizePositionsImpl(first, last, string, set, terminator); +} + RT_EXT_API_GROUP_END } } // namespace Fortran::runtime diff --git a/flang-rt/unittests/Runtime/CharacterTest.cpp b/flang-rt/unittests/Runtime/CharacterTest.cpp index 5d8bcd2354c4..4b304a98ada1 100644 --- a/flang-rt/unittests/Runtime/CharacterTest.cpp +++ b/flang-rt/unittests/Runtime/CharacterTest.cpp @@ -431,6 +431,214 @@ TYPED_TEST(RepeatTests, Repeat) { } } +// Test TOKENIZE() - Form 1 and Form 2 +// Helper to create a scalar character descriptor from a raw C string. +template +OwningPtr CreateScalarDescriptor(const char *raw) { + std::size_t len{std::strlen(raw)}; + OwningPtr desc{Descriptor::Create( + sizeof(CHAR), len, nullptr, 0, nullptr, CFI_attribute_other)}; + if (desc->Allocate(kNoAsyncObject) != 0) { + return nullptr; + } + std::basic_string converted{raw, raw + len}; + std::copy(converted.begin(), converted.end(), desc->OffsetElement(0)); + return desc; +} + +// Helper to create an unallocated allocatable character descriptor (rank 1, +// deferred length) for TOKENS or SEPARATOR output. +template StaticDescriptor<1> CreateAllocatableCharDescriptor() { + StaticDescriptor<1> staticDesc; + Descriptor &desc{staticDesc.descriptor()}; + desc.Establish(static_cast(sizeof(CHAR)), static_cast(0), + nullptr, 1, nullptr, CFI_attribute_allocatable); + desc.GetDimension(0).SetBounds(1, 0); + return staticDesc; +} + +// Helper to create an unallocated allocatable integer descriptor (rank 1) +// for FIRST or LAST output. +static StaticDescriptor<1> CreateAllocatableIntDescriptor() { + StaticDescriptor<1> staticDesc; + Descriptor &desc{staticDesc.descriptor()}; + desc.Establish( + TypeCategory::Integer, 4, nullptr, 1, nullptr, CFI_attribute_allocatable); + desc.GetDimension(0).SetBounds(1, 0); + return staticDesc; +} + +// Helper to extract a token string from the TOKENS descriptor. +template +std::basic_string GetToken(Descriptor &tokens, std::size_t index) { + std::size_t elemBytes{tokens.ElementBytes()}; + std::size_t charLen{elemBytes / sizeof(CHAR)}; + const CHAR *data{tokens.OffsetElement(index * elemBytes)}; + return std::basic_string(data, charLen); +} + +template struct TokenizeTests : public ::testing::Test {}; +TYPED_TEST_SUITE(TokenizeTests, CharacterTypes, ); + +// Form 1: basic tokenization +TYPED_TEST(TokenizeTests, Form1Basic) { + auto string{CreateScalarDescriptor("first,second,third")}; + auto set{CreateScalarDescriptor(",")}; + ASSERT_NE(string, nullptr); + ASSERT_NE(set, nullptr); + + auto tokensStatic{CreateAllocatableCharDescriptor()}; + Descriptor &tokens{tokensStatic.descriptor()}; + + RTNAME(Tokenize)(tokens, nullptr, *string, *set); + + // Expect 3 tokens: "first", "second", "third" + ASSERT_TRUE(tokens.IsAllocated()); + EXPECT_EQ(tokens.GetDimension(0).Extent(), 3); + // Longest token is "second" (6 chars) + EXPECT_EQ(tokens.ElementBytes(), 6u * sizeof(TypeParam)); + + // Tokens are blank-padded to max length + std::basic_string t0{GetToken(tokens, 0)}; + std::basic_string t1{GetToken(tokens, 1)}; + std::basic_string t2{GetToken(tokens, 2)}; + std::basic_string e0{'f', 'i', 'r', 's', 't', ' '}; + std::basic_string e1{'s', 'e', 'c', 'o', 'n', 'd'}; + std::basic_string e2{'t', 'h', 'i', 'r', 'd', ' '}; + EXPECT_EQ(t0, e0); + EXPECT_EQ(t1, e1); + EXPECT_EQ(t2, e2); + tokens.Deallocate(); +} + +// Form 1: empty string produces one zero-length token +TYPED_TEST(TokenizeTests, Form1EmptyString) { + auto string{CreateScalarDescriptor("")}; + auto set{CreateScalarDescriptor(",")}; + ASSERT_NE(string, nullptr); + ASSERT_NE(set, nullptr); + + auto tokensStatic{CreateAllocatableCharDescriptor()}; + Descriptor &tokens{tokensStatic.descriptor()}; + + RTNAME(Tokenize)(tokens, nullptr, *string, *set); + + ASSERT_TRUE(tokens.IsAllocated()); + EXPECT_EQ(tokens.GetDimension(0).Extent(), 1) << "empty string = 1 token"; + EXPECT_EQ(tokens.ElementBytes(), 0u) << "token length should be 0"; + EXPECT_EQ(tokens.GetDimension(0).LowerBound(), 1); + tokens.Deallocate(); +} + +// Form 1: consecutive delimiters produce empty tokens +TYPED_TEST(TokenizeTests, Form1ConsecutiveDelimiters) { + auto string{CreateScalarDescriptor("a,,b")}; + auto set{CreateScalarDescriptor(",")}; + ASSERT_NE(string, nullptr); + ASSERT_NE(set, nullptr); + + auto tokensStatic{CreateAllocatableCharDescriptor()}; + Descriptor &tokens{tokensStatic.descriptor()}; + + RTNAME(Tokenize)(tokens, nullptr, *string, *set); + + ASSERT_TRUE(tokens.IsAllocated()); + // Expect 3 tokens: "a", "", "b" + EXPECT_EQ(tokens.GetDimension(0).Extent(), 3); + tokens.Deallocate(); +} + +// Form 1: with SEPARATOR output +TYPED_TEST(TokenizeTests, Form1WithSeparator) { + auto string{CreateScalarDescriptor("a,b;c")}; + auto set{CreateScalarDescriptor(",;")}; + ASSERT_NE(string, nullptr); + ASSERT_NE(set, nullptr); + + auto tokensStatic{CreateAllocatableCharDescriptor()}; + Descriptor &tokens{tokensStatic.descriptor()}; + auto sepStatic{CreateAllocatableCharDescriptor()}; + Descriptor &separator{sepStatic.descriptor()}; + + RTNAME(Tokenize)(tokens, &separator, *string, *set); + + // Expect 3 tokens: "a", "b", "c" + ASSERT_TRUE(tokens.IsAllocated()); + EXPECT_EQ(tokens.GetDimension(0).Extent(), 3); + ASSERT_TRUE(separator.IsAllocated()); + // Expect 2 separators: ',' then ';' + EXPECT_EQ(separator.GetDimension(0).Extent(), 2); + EXPECT_EQ(separator.ElementBytes(), sizeof(TypeParam)); + + // Check separator values: ',' then ';' + const TypeParam *sep0{separator.OffsetElement(0)}; + const TypeParam *sep1{ + separator.OffsetElement(separator.ElementBytes())}; + EXPECT_EQ(*sep0, static_cast(',')); + EXPECT_EQ(*sep1, static_cast(';')); + tokens.Deallocate(); + separator.Deallocate(); +} + +// Form 2: basic position output +TYPED_TEST(TokenizeTests, Form2Basic) { + // From the standard example: "first,second,,fourth" + auto string{CreateScalarDescriptor("first,second,,fourth")}; + auto set{CreateScalarDescriptor(",;")}; + ASSERT_NE(string, nullptr); + ASSERT_NE(set, nullptr); + + auto firstStatic{CreateAllocatableIntDescriptor()}; + Descriptor &first{firstStatic.descriptor()}; + auto lastStatic{CreateAllocatableIntDescriptor()}; + Descriptor &last{lastStatic.descriptor()}; + + RTNAME(TokenizePositions)(first, last, *string, *set); + + ASSERT_TRUE(first.IsAllocated()); + ASSERT_TRUE(last.IsAllocated()); + EXPECT_EQ(first.GetDimension(0).Extent(), 4); + EXPECT_EQ(last.GetDimension(0).Extent(), 4); + + // Expect: FIRST = [1, 7, 14, 15], LAST = [5, 12, 13, 20] + EXPECT_EQ(*first.OffsetElement(0 * sizeof(std::int32_t)), 1); + EXPECT_EQ(*first.OffsetElement(1 * sizeof(std::int32_t)), 7); + EXPECT_EQ(*first.OffsetElement(2 * sizeof(std::int32_t)), 14); + EXPECT_EQ(*first.OffsetElement(3 * sizeof(std::int32_t)), 15); + EXPECT_EQ(*last.OffsetElement(0 * sizeof(std::int32_t)), 5); + EXPECT_EQ(*last.OffsetElement(1 * sizeof(std::int32_t)), 12); + EXPECT_EQ(*last.OffsetElement(2 * sizeof(std::int32_t)), 13); + EXPECT_EQ(*last.OffsetElement(3 * sizeof(std::int32_t)), 20); + first.Deallocate(); + last.Deallocate(); +} + +// Form 2: empty string produces one token with FIRST=1, LAST=0 +TYPED_TEST(TokenizeTests, Form2EmptyString) { + auto string{CreateScalarDescriptor("")}; + auto set{CreateScalarDescriptor(",")}; + ASSERT_NE(string, nullptr); + ASSERT_NE(set, nullptr); + + auto firstStatic{CreateAllocatableIntDescriptor()}; + Descriptor &first{firstStatic.descriptor()}; + auto lastStatic{CreateAllocatableIntDescriptor()}; + Descriptor &last{lastStatic.descriptor()}; + + RTNAME(TokenizePositions)(first, last, *string, *set); + + ASSERT_TRUE(first.IsAllocated()); + ASSERT_TRUE(last.IsAllocated()); + EXPECT_EQ(first.GetDimension(0).Extent(), 1) << "empty string = 1 token"; + EXPECT_EQ(last.GetDimension(0).Extent(), 1); + + // Expect FIRST(1)=1, LAST(1)=0 + EXPECT_EQ(*first.OffsetElement(0), 1) << "FIRST(1) = 1"; + EXPECT_EQ(*last.OffsetElement(0), 0) << "LAST(1) = 0"; + first.Deallocate(); + last.Deallocate(); +} + // Test F_C_STRING() TEST(CharacterTests, FCString) { // Test 1: Default behavior (trim trailing blanks) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index edc6d1bf8e25..753d475b4de5 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -447,6 +447,7 @@ struct IntrinsicLibrary { fir::ExtendedValue genTeamNumber(mlir::Type, llvm::ArrayRef); mlir::Value genTime(mlir::Type, llvm::ArrayRef); + void genTokenize(llvm::ArrayRef); mlir::Value genTrailz(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genTransfer(mlir::Type, llvm::ArrayRef); diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Character.h b/flang/include/flang/Optimizer/Builder/Runtime/Character.h index b7181e33d688..684b7498e725 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Character.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Character.h @@ -142,6 +142,22 @@ mlir::Value genVerify(fir::FirOpBuilder &builder, mlir::Location loc, int kind, mlir::Value setBase, mlir::Value setLen, mlir::Value back); +/// Generate call to TOKENIZE runtime (Form 1). +/// Splits \p stringBox into tokens based on separator characters in \p setBox. +/// \p tokensBox must be an unallocated allocatable array that receives the +/// token substrings. \p separatorBox is optional and receives separator chars. +void genTokenize(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value tokensBox, mlir::Value separatorBox, + mlir::Value stringBox, mlir::Value setBox); + +/// Generate call to TOKENIZE runtime (Form 2). +/// Returns token positions rather than substrings. +/// \p firstBox and \p lastBox must be unallocated allocatable integer arrays +/// that receive the starting and ending positions of each token. +void genTokenizePositions(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value firstBox, mlir::Value lastBox, + mlir::Value stringBox, mlir::Value setBox); + } // namespace fir::runtime #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CHARACTER_H diff --git a/flang/include/flang/Runtime/character.h b/flang/include/flang/Runtime/character.h index 36a66fc50e71..360418b7d553 100644 --- a/flang/include/flang/Runtime/character.h +++ b/flang/include/flang/Runtime/character.h @@ -130,6 +130,13 @@ std::size_t RTDECL(Verify4)(const char32_t *, std::size_t, const char32_t *set, void RTDECL(Verify)(Descriptor &result, const Descriptor &string, const Descriptor &set, const Descriptor *back /*can be null*/, int kind, const char *sourceFile = nullptr, int sourceLine = 0); + +void RTDECL(Tokenize)(Descriptor &tokens, Descriptor *separator, + const Descriptor &string, const Descriptor &set, + const char *sourceFile = nullptr, int sourceLine = 0); +void RTDECL(TokenizePositions)(Descriptor &first, Descriptor &last, + const Descriptor &string, const Descriptor &set, + const char *sourceFile = nullptr, int sourceLine = 0); } } // namespace Fortran::runtime #endif // FORTRAN_RUNTIME_CHARACTER_H_ diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index 9eaad6c04926..2ae1c478489c 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -22,6 +22,7 @@ #include "flang/Support/Fortran.h" #include "llvm/Support/raw_ostream.h" #include +#include #include #include #include @@ -1749,6 +1750,26 @@ static const IntrinsicInterface intrinsicSubroutine[]{ {{"seconds", AnyInt, Rank::scalar, Optionality::required, common::Intent::In}}, {}, Rank::elemental, IntrinsicClass::impureSubroutine}, + {"tokenize", + {{"string", SameCharNoLen, Rank::scalar, Optionality::required, + common::Intent::In}, + {"set", SameCharNoLen, Rank::scalar, Optionality::required, + common::Intent::In}, + {"tokens", SameCharNoLen, Rank::vector, Optionality::required, + common::Intent::Out}, + {"separator", SameCharNoLen, Rank::vector, Optionality::optional, + common::Intent::Out}}, + {}, Rank::elemental, IntrinsicClass::pureSubroutine}, + {"tokenize", + {{"string", SameCharNoLen, Rank::scalar, Optionality::required, + common::Intent::In}, + {"set", SameCharNoLen, Rank::scalar, Optionality::required, + common::Intent::In}, + {"first", AnyInt, Rank::vector, Optionality::required, + common::Intent::Out}, + {"last", AnyInt, Rank::vector, Optionality::required, + common::Intent::Out}}, + {}, Rank::elemental, IntrinsicClass::pureSubroutine}, {"unlink", {{"path", DefaultChar, Rank::scalar, Optionality::required, common::Intent::In}, @@ -3704,6 +3725,24 @@ static bool ApplySpecificChecks(SpecificCall &call, FoldingContext &context) { arg ? arg->sourceLocation() : context.messages().at(), "Argument of LOC() must be an object or procedure"_err_en_US); } + } else if (name == "tokenize") { + // Both forms of TOKENIZE have at least 4 dummy arguments, and the last two + // must be allocatable. + const auto &dummies{ + call.specificIntrinsic.characteristics.value().dummyArguments}; + for (int i{2}; i < 4; ++i) { + const auto &arg{call.arguments[i]}; + if (arg) { + if (const auto *expr{arg->UnwrapExpr()}) { + if (!IsAllocatableDesignator(*expr)) { + ok = false; + context.messages().Say(arg->sourceLocation(), + "'%s=' argument to 'tokenize' must be ALLOCATABLE"_err_en_US, + dummies[i].name); + } + } + } + } } return ok; } @@ -3766,15 +3805,81 @@ std::optional IntrinsicProcTable::Implementation::Probe( } } + // Find the specific subroutine and match the actual arguments against its + // dummy argument patterns. If there are multiple specific subroutines with + // the same name, try them in order. If one matches, clear out the errors. + // If none match, keep the messages from the form whose dummy argument + // types and keywords best match the actual arguments supplied. if (call.isSubroutineCall) { const std::string &name{ResolveAlias(call.name)}; auto subrRange{subroutines_.equal_range(name)}; + parser::Messages subrErrors; + int bestScore{INT_MIN}; + parser::Messages localBuffer; + parser::Messages *finalBuffer{context.messages().messages()}; + parser::ContextualMessages localMessages{ + context.messages().at(), finalBuffer ? &localBuffer : nullptr}; + FoldingContext localContext{context, localMessages}; for (auto iter{subrRange.first}; iter != subrRange.second; ++iter) { if (auto specificCall{iter->second->Match( - call, defaults_, arguments, context, builtinsScope_)}) { + call, defaults_, arguments, localContext, builtinsScope_)}) { + if (finalBuffer) { + finalBuffer->Annex(std::move(localBuffer)); + } ApplySpecificChecks(*specificCall, context); return specificCall; } + // Match failed. Compute a score reflecting how well the actual + // arguments correspond to this form's dummy arguments: count the + // number of positional arguments whose type category matches the + // corresponding dummy's expected categories, plus the number of + // keyword arguments whose keyword name matches a dummy in this form, + // minus the number of required dummies that cannot be satisfied by + // the number of arguments provided. Keep the error messages from + // the form with the highest score, preferring an earlier form on + // ties. + const auto *iface{iter->second}; + int dummyCount{iface->CountArguments()}; + int numRequired{0}; + for (int j{0}; j < dummyCount; ++j) { + if (iface->dummy[j].optionality == Optionality::required) { + ++numRequired; + } + } + int score{0}; + int positionalIndex{0}; + for (const auto &arg : arguments) { + if (arg) { + if (auto kw{arg->keyword()}) { + for (int k{0}; k < dummyCount; ++k) { + if (kw == iface->dummy[k].keyword) { + ++score; + break; + } + } + } else { + if (positionalIndex < dummyCount) { + if (auto type{arg->GetType()}) { + if (iface->dummy[positionalIndex].typePattern.categorySet.test( + type->category())) { + ++score; + } + } + } + ++positionalIndex; + } + } + } + score -= std::max(0, numRequired - static_cast(arguments.size())); + if (score > bestScore) { + bestScore = score; + subrErrors = std::move(localBuffer); + } else { + localBuffer.clear(); + } + } + if (finalBuffer) { + finalBuffer->Annex(std::move(subrErrors)); } if (IsIntrinsicFunction(call.name) && !IsDualIntrinsic(call.name)) { context.messages().Say( diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index f71ef0f12c99..090236fe3e5c 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -797,6 +797,13 @@ static constexpr IntrinsicHandler handlers[]{ {"team", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, {"time", &I::genTime, {}, /*isElemental=*/false}, + {"tokenize", + &I::genTokenize, + {{{"string", asAddr}, + {"set", asAddr}, + {"out1", asInquired}, + {"out2", asInquired, handleDynamicOptional}}}, + /*isElemental=*/false}, {"trailz", &I::genTrailz}, {"transfer", &I::genTransfer, @@ -8529,6 +8536,101 @@ void IntrinsicLibrary::genSleep(llvm::ArrayRef args) { fir::runtime::genSleep(builder, loc, fir::getBase(args[0])); } +// TOKENIZE +void IntrinsicLibrary::genTokenize(llvm::ArrayRef args) { + assert(args.size() == 4 && "TOKENIZE requires 3 or 4 arguments"); + + const fir::ExtendedValue &string = args[0]; + const fir::ExtendedValue &set = args[1]; + + // Distinguish forms by the element type of the third argument. For form 1, + // TOKENS is CHARACTER. For form 2, FIRST is INTEGER. + mlir::Type thirdArgEleTy = fir::getElementTypeOf(args[2]); + bool isForm1 = fir::isa_char(thirdArgEleTy); + bool isForm2 = fir::isa_integer(thirdArgEleTy); + assert((isForm1 || isForm2) && + "TOKENIZE third argument must be CHARACTER or INTEGER"); + + mlir::Value stringBox = builder.createBox(loc, string); + mlir::Value setBox = builder.createBox(loc, set); + + mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType()); + mlir::Type boxNoneRefTy = fir::ReferenceType::get(boxNoneTy); + + // A lambda to return the address of the descriptor storage to pass to the + // runtime. For MutableBoxValue, this also handles any required syncing + // before/after the runtime call. + auto getBoxStorageAddr = + [&](const fir::ExtendedValue &exv, llvm::StringRef what, + const fir::MutableBoxValue **mutableBoxOut) -> mlir::Value { + if (const auto *mb = exv.getBoxOf()) { + if (mutableBoxOut) + *mutableBoxOut = mb; + mlir::Value addr = fir::factory::getMutableIRBox(builder, loc, *mb); + return builder.createConvert(loc, boxNoneRefTy, addr); + } + if (const auto *bv = exv.getBoxOf()) { + mlir::Value addr = bv->getAddr(); + if (auto boxTy = fir::dyn_cast_ptrEleTy(addr.getType())) { + if (mlir::isa(boxTy)) + return builder.createConvert(loc, boxNoneRefTy, addr); + } + fir::emitFatalError(loc, llvm::Twine("TOKENIZE: ") + what + + " must be a descriptor address"); + } + fir::emitFatalError(loc, llvm::Twine("TOKENIZE: ") + what + + " not lowered as a boxed entity"); + }; + + if (isForm1) { + // Form 1: TOKENIZE(STRING, SET, TOKENS [, SEPARATOR]) + const fir::ExtendedValue &tokens = args[2]; + const fir::MutableBoxValue *tokensMutableBox{nullptr}; + mlir::Value tokensBoxAddr = + getBoxStorageAddr(tokens, "TOKENS", &tokensMutableBox); + + // Handle optional SEPARATOR argument + mlir::Value separatorBoxAddr; + const fir::MutableBoxValue *separatorMutableBox{nullptr}; + if (!isStaticallyAbsent(args[3])) { + const fir::ExtendedValue &separator = args[3]; + separatorBoxAddr = + getBoxStorageAddr(separator, "SEPARATOR", &separatorMutableBox); + } else { + separatorBoxAddr = builder.createNullConstant(loc, boxNoneRefTy); + } + + // Call the Form 1 runtime function + fir::runtime::genTokenize(builder, loc, tokensBoxAddr, separatorBoxAddr, + stringBox, setBox); + + if (tokensMutableBox) + fir::factory::syncMutableBoxFromIRBox(builder, loc, *tokensMutableBox); + if (separatorMutableBox) + fir::factory::syncMutableBoxFromIRBox(builder, loc, *separatorMutableBox); + + } else { + // Form 2: TOKENIZE(STRING, SET, FIRST, LAST) + const fir::ExtendedValue &first = args[2]; + const fir::ExtendedValue &last = args[3]; + + const fir::MutableBoxValue *firstMutableBox{nullptr}; + const fir::MutableBoxValue *lastMutableBox{nullptr}; + mlir::Value firstBoxAddr = + getBoxStorageAddr(first, "FIRST", &firstMutableBox); + mlir::Value lastBoxAddr = getBoxStorageAddr(last, "LAST", &lastMutableBox); + + // Call the Form 2 runtime function + fir::runtime::genTokenizePositions(builder, loc, firstBoxAddr, lastBoxAddr, + stringBox, setBox); + + if (firstMutableBox) + fir::factory::syncMutableBoxFromIRBox(builder, loc, *firstMutableBox); + if (lastMutableBox) + fir::factory::syncMutableBoxFromIRBox(builder, loc, *lastMutableBox); + } +} + // TRANSFER fir::ExtendedValue IntrinsicLibrary::genTransfer(mlir::Type resultType, diff --git a/flang/lib/Optimizer/Builder/Runtime/Character.cpp b/flang/lib/Optimizer/Builder/Runtime/Character.cpp index e297125880f7..28e795b8de75 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Character.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Character.cpp @@ -285,6 +285,35 @@ void fir::runtime::genVerifyDescriptor(fir::FirOpBuilder &builder, kind); } +void fir::runtime::genTokenize(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value tokensBox, mlir::Value separatorBox, + mlir::Value stringBox, mlir::Value setBox) { + auto func = fir::runtime::getRuntimeFunc(loc, builder); + auto fTy = func.getFunctionType(); + auto sourceFile = fir::factory::locationToFilename(builder, loc); + auto sourceLine = + fir::factory::locationToLineNo(builder, loc, fTy.getInput(5)); + auto args = + fir::runtime::createArguments(builder, loc, fTy, tokensBox, separatorBox, + stringBox, setBox, sourceFile, sourceLine); + fir::CallOp::create(builder, loc, func, args); +} + +void fir::runtime::genTokenizePositions( + fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value firstBox, + mlir::Value lastBox, mlir::Value stringBox, mlir::Value setBox) { + auto func = + fir::runtime::getRuntimeFunc(loc, builder); + auto fTy = func.getFunctionType(); + auto sourceFile = fir::factory::locationToFilename(builder, loc); + auto sourceLine = + fir::factory::locationToLineNo(builder, loc, fTy.getInput(5)); + auto args = + fir::runtime::createArguments(builder, loc, fTy, firstBox, lastBox, + stringBox, setBox, sourceFile, sourceLine); + fir::CallOp::create(builder, loc, func, args); +} + mlir::Value fir::runtime::genVerify(fir::FirOpBuilder &builder, mlir::Location loc, int kind, mlir::Value stringBase, diff --git a/flang/test/Lower/Intrinsics/tokenize.f90 b/flang/test/Lower/Intrinsics/tokenize.f90 new file mode 100644 index 000000000000..ef533794a5a3 --- /dev/null +++ b/flang/test/Lower/Intrinsics/tokenize.f90 @@ -0,0 +1,27 @@ +! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s + +! CHECK-LABEL: tok_form1 +subroutine tok_form1() + implicit none + character(:), allocatable :: tokens(:) + call tokenize("a,b", ",", tokens) + ! CHECK-DAG: %[[TOKENS:.*]] = fir.alloca !fir.box>>> + ! CHECK-DAG: %[[TOKENS_DECL:.*]]:2 = hlfir.declare %[[TOKENS]] {fortran_attrs = #fir.var_attrs + ! CHECK-DAG: %[[TOKENS_NONE:.*]] = fir.convert %[[TOKENS_DECL]]#0 : (!fir.ref>>>>) -> !fir.ref> + ! CHECK-DAG: %[[SEP_NONE:.*]] = fir.zero_bits !fir.ref> + ! CHECK: fir.call @_FortranATokenize(%[[TOKENS_NONE]], %[[SEP_NONE]], +end subroutine tok_form1 + +! CHECK-LABEL: tok_form2 +subroutine tok_form2() + implicit none + integer, allocatable :: first(:), last(:) + call tokenize("a,,b", ",", first, last) + ! CHECK-DAG: %[[FIRST:.*]] = fir.alloca !fir.box>> + ! CHECK-DAG: %[[LAST:.*]] = fir.alloca !fir.box>> + ! CHECK-DAG: %[[FIRST_DECL:.*]]:2 = hlfir.declare %[[FIRST]] {fortran_attrs = #fir.var_attrs + ! CHECK-DAG: %[[LAST_DECL:.*]]:2 = hlfir.declare %[[LAST]] {fortran_attrs = #fir.var_attrs + ! CHECK-DAG: %[[FIRST_NONE:.*]] = fir.convert %[[FIRST_DECL]]#0 : (!fir.ref>>>) -> !fir.ref> + ! CHECK-DAG: %[[LAST_NONE:.*]] = fir.convert %[[LAST_DECL]]#0 : (!fir.ref>>>) -> !fir.ref> + ! CHECK: fir.call @_FortranATokenizePositions(%[[FIRST_NONE]], %[[LAST_NONE]], +end subroutine tok_form2 diff --git a/flang/test/Semantics/tokenize-errors.f90 b/flang/test/Semantics/tokenize-errors.f90 new file mode 100644 index 000000000000..204894817e90 --- /dev/null +++ b/flang/test/Semantics/tokenize-errors.f90 @@ -0,0 +1,267 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +! Check for semantic errors in tokenize() subroutine calls +! Based on Fortran 2023 standard requirements + +program test_tokenize_errors + implicit none + + ! Valid declarations for reference + character(:), allocatable :: tokens(:), separator(:) + integer, allocatable :: first(:), last(:) + character(20) :: string + character(5) :: set + + ! Invalid declarations for testing + integer :: int_scalar, int_array(10) + real :: real_scalar + character(10) :: fixed_tokens(10), fixed_separator(10) + integer :: fixed_first(10), fixed_last(10) + character(10) :: string_array(5) + character(5) :: set_array(5) + character(len=10, kind=2) :: wide_string, wide_set + character(len=10, kind=2), allocatable :: wide_tokens(:) + real, allocatable :: real_first(:), real_last(:) + logical, allocatable :: logical_array(:) + integer, allocatable :: first_2d(:,:), last_2d(:,:) + character(:), allocatable :: tokens_2d(:,:) + + type t_coarray + integer, allocatable :: a(:) + end type + type(t_coarray) :: coa[*] + integer, allocatable :: coindexed_first(:)[:] + integer, allocatable :: coindexed_last(:)[:] + character(:), allocatable :: coindexed_tokens(:)[:] + + !======================================================================== + ! Test Form 1: TOKENIZE(STRING, SET, TOKENS [, SEPARATOR]) + !======================================================================== + + ! Valid call (reference) + call tokenize("hello world", " ", tokens) + call tokenize(string, set, tokens, separator) + + !======================================================================== + ! Form 1: Wrong types for STRING argument + !======================================================================== + + !ERROR: Actual argument for 'string=' has bad type 'INTEGER(4)' + call tokenize(int_scalar, set, tokens) + + !ERROR: Actual argument for 'string=' has bad type 'REAL(4)' + call tokenize(real_scalar, set, tokens) + + !======================================================================== + ! Form 1: Wrong rank for STRING (must be scalar) + !======================================================================== + ! Fails + !ERROR: 'string=' argument has unacceptable rank 1 + call tokenize(string_array, set, tokens) + + !======================================================================== + ! Form 1: Wrong types for SET argument + !======================================================================== + + !ERROR: Actual argument for 'set=' has bad type 'INTEGER(4)' + call tokenize(string, int_scalar, tokens) + + !ERROR: Actual argument for 'set=' has bad type 'REAL(4)' + call tokenize(string, real_scalar, tokens) + + !======================================================================== + ! Form 1: Wrong rank for SET (must be scalar) + !======================================================================== + ! Fails + !ERROR: 'set=' argument has unacceptable rank 1 + call tokenize(string, set_array, tokens) + + !======================================================================== + ! Form 1: Wrong types for TOKENS argument + !======================================================================== + ! Fails + !ERROR: Actual argument for 'tokens=' has bad type 'INTEGER(4)' + call tokenize(string, set, int_array) + ! Fails + !ERROR: Actual argument for 'tokens=' has bad type 'REAL(4)' + call tokenize(string, set, real_first) + ! Fails + !ERROR: Actual argument for 'tokens=' has bad type 'LOGICAL(4)' + call tokenize(string, set, logical_array) + + !======================================================================== + ! Form 1: Wrong rank for TOKENS (must be rank-1 array) + !======================================================================== + ! Fails + !ERROR: 'tokens=' argument has unacceptable rank 0 + call tokenize(string, set, string) + ! Fails + !ERROR: 'tokens=' argument has unacceptable rank 2 + call tokenize(string, set, tokens_2d) + + !======================================================================== + ! Form 1: TOKENS must be allocatable + !======================================================================== + + !ERROR: 'tokens=' argument to 'tokenize' must be ALLOCATABLE + call tokenize(string, set, fixed_tokens) + + !======================================================================== + ! Form 1: Wrong types for optional SEPARATOR argument + !======================================================================== + ! Fails + !ERROR: Actual argument for 'separator=' has bad type 'INTEGER(4)' + call tokenize(string, set, tokens, int_array) + ! Fails + !ERROR: Actual argument for 'separator=' has bad type 'REAL(4)' + call tokenize(string, set, tokens, real_first) + + !======================================================================== + ! Form 1: Wrong rank for SEPARATOR (must be rank-1 array) + !======================================================================== + ! Fails + !ERROR: 'separator=' argument has unacceptable rank 0 + call tokenize(string, set, tokens, set) + + !======================================================================== + ! Form 1: SEPARATOR must be allocatable + !======================================================================== + + !ERROR: 'separator=' argument to 'tokenize' must be ALLOCATABLE + call tokenize(string, set, tokens, fixed_separator) + + !======================================================================== + ! Form 1: Character kind mismatches + !======================================================================== + ! Fails + ! wide_string (kind=2) becomes sameArg; set (kind=1) fails sameKind check + !ERROR: Actual argument for 'set=' has bad type or kind 'CHARACTER(KIND=1,LEN=5_8)' + call tokenize(wide_string, set, tokens) + ! Fails + !ERROR: Actual argument for 'set=' has bad type or kind 'CHARACTER(KIND=2,LEN=10_8)' + call tokenize(string, wide_set, tokens) + ! Fails + !ERROR: Actual argument for 'tokens=' has bad type or kind 'CHARACTER(KIND=2,LEN=10_8)' + call tokenize(string, set, wide_tokens) + + !======================================================================== + ! Test Form 2: TOKENIZE(STRING, SET, FIRST, LAST) + !======================================================================== + + ! Valid call (reference) + call tokenize("hello world", " ", first, last) + + !======================================================================== + ! Form 2: Wrong types for STRING argument (same as Form 1) + !======================================================================== + + !ERROR: Actual argument for 'string=' has bad type 'INTEGER(4)' + call tokenize(int_scalar, set, first, last) + + !======================================================================== + ! Form 2: Wrong types for SET argument (same as Form 1) + !======================================================================== + + !ERROR: Actual argument for 'set=' has bad type 'INTEGER(4)' + call tokenize(string, int_scalar, first, last) + + !======================================================================== + ! Form 2: Wrong types for FIRST argument + !======================================================================== + + !ERROR: Actual argument for 'first=' has bad type 'REAL(4)' + call tokenize(string, set, real_first, last) + + !ERROR: Actual argument for 'first=' has bad type 'LOGICAL(4)' + call tokenize(string, set, logical_array, last) + + !======================================================================== + ! Form 2: Wrong rank for FIRST (must be rank-1 array) + !======================================================================== + + !ERROR: 'first=' argument has unacceptable rank 0 + call tokenize(string, set, int_scalar, last) + + !ERROR: 'first=' argument has unacceptable rank 2 + call tokenize(string, set, first_2d, last) + + !======================================================================== + ! Form 2: FIRST must be allocatable + !======================================================================== + + !ERROR: 'first=' argument to 'tokenize' must be ALLOCATABLE + call tokenize(string, set, fixed_first, last) + + !======================================================================== + ! Form 2: Wrong types for LAST argument + !======================================================================== + + !ERROR: Actual argument for 'last=' has bad type 'REAL(4)' + call tokenize(string, set, first, real_first) + + !======================================================================== + ! Form 2: Wrong rank for LAST (must be rank-1 array) + !======================================================================== + + !ERROR: 'last=' argument has unacceptable rank 0 + call tokenize(string, set, first, int_scalar) + + !ERROR: 'last=' argument has unacceptable rank 2 + call tokenize(string, set, first, last_2d) + + !======================================================================== + ! Form 2: LAST must be allocatable + !======================================================================== + + !ERROR: 'last=' argument to 'tokenize' must be ALLOCATABLE + call tokenize(string, set, first, fixed_last) + + !======================================================================== + ! Argument count errors + !======================================================================== + + !ERROR: missing mandatory 'set=' argument + call tokenize(string) + ! Fails + !ERROR: missing mandatory 'tokens=' argument + call tokenize(string, set) + + !ERROR: too many actual arguments for intrinsic 'tokenize' + call tokenize(string, set, tokens, separator, first) + + !======================================================================== + ! Coindexed object restrictions (if applicable) + !======================================================================== + + ! Note: Coarray tests depend on whether the standard allows coindexed + ! objects for TOKENIZE. Uncomment if compiler version enforces this. + + ! !ERROR: 'first=' argument to 'tokenize' may not be a coindexed object + ! call tokenize(string, set, coindexed_first[1], last) + + ! !ERROR: 'last=' argument to 'tokenize' may not be a coindexed object + ! call tokenize(string, set, first, coindexed_last[1]) + + ! !ERROR: 'tokens=' argument to 'tokenize' may not be a coindexed object + ! call tokenize(string, set, coindexed_tokens[1]) + + !======================================================================== + ! Keyword argument errors + !======================================================================== + + !ERROR: unknown keyword argument to intrinsic 'tokenize' + call tokenize(string, set, tokens, invalid_keyword=separator) + ! Fails + !ERROR: Actual argument for 'tokens=' has bad type 'INTEGER(4)' + call tokenize(string=string, set=set, tokens=first, separator=separator) + + !======================================================================== + ! Type/kind inconsistency between STRING, SET, TOKENS, SEPARATOR + !======================================================================== + + ! All character arguments must have the same kind (but can have different lengths) + ! This is implicitly handled by SameCharNoLen in the intrinsic definition + ! Fails + !ERROR: Actual argument for 'set=' has bad type or kind 'CHARACTER(KIND=2,LEN=10_8)' + call tokenize(string=string, set=wide_set, tokens=tokens) + +end program test_tokenize_errors