diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index ee42801da09d..a41d8d882650 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -236,19 +236,25 @@ static bool isSafeToParallelize(Operation *op) {
 
   // Thread-local variables allocated in the OpenMP parallel region or coming
   // from privatizing clauses are private to each thread and thus safe (and
-  // sometimes required) to parallelize. If the compiler wraps such load/stores
-  // in an omp.single block, only one thread updates its local copy, while
-  // all other threads read uninitialized data (see issue #143330).
-  // Use MemoryEffectOpInterface to check all memory effects generically.
-  // This also handles hlfir.assign which is present when the pass runs before
-  // HLFIR lowering.
+  // sometimes required) to parallelize. If the compiler wraps stores to
+  // thread-local variables in an omp.single block, only one thread updates
+  // its local copy, while all other threads read uninitialized data (see
+  // issue #143330).
+  //
+  // Only WRITE effects to thread-local memory are considered safe here, not
+  // reads. If reads were also safe, the cascading effect in moveToSingle
+  // could cause entire SingleRegions to become fully parallelized (all ops
+  // safe), eliminating the omp.single and its implicit barrier. This removes
+  // synchronization points needed to keep threads coordinated inside
+  // sequential loops that contain workshared operations.
   if (auto memEffects = dyn_cast<MemoryEffectOpInterface>(op)) {
     SmallVector<MemoryEffects::EffectInstance> effects;
     memEffects.getEffects(effects);
     if (!effects.empty() &&
         llvm::all_of(effects, [&](const MemoryEffects::EffectInstance &effect) {
           Value val = effect.getValue();
-          return val && isOpenMPThreadLocalMemory(op, val);
+          return val && isa<MemoryEffects::Write>(effect.getEffect()) &&
+                 isOpenMPThreadLocalMemory(op, val);
         }))
       return true;
   }
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
index d046ef4cc46f..d6000c989515 100644
--- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -119,6 +119,39 @@ func.func @hlfir_assign_private_clause(%arg0: !fir.ref<i32>) {
 // CHECK-NEXT:  }
 
 
+// -----
+
+// Check that hlfir.assign from a shared variable to a private variable is
+// NOT parallelized. The assign has a Read effect on the shared RHS, which
+// is not a write to thread-local memory, so isSafeToParallelize returns false.
+
+omp.private {type = private} @y_private : i32
+
+// CHECK-LABEL: func.func @hlfir_assign_shared_to_private
+func.func @hlfir_assign_shared_to_private(%arg0: !fir.ref<i32>, %shared: !fir.ref<i32>) {
+  omp.parallel private(@y_private %arg0 -> %priv_arg : !fir.ref<i32>) {
+    %decl:2 = hlfir.declare %priv_arg {uniq_name = "x"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+    omp.workshare {
+      // hlfir.assign with a shared RHS variable should stay in omp.single
+      hlfir.assign %shared to %decl#0 : !fir.ref<i32>, !fir.ref<i32>
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK:       omp.parallel private(@y_private %{{.*}} -> %[[PRIV_ARG:.*]] : !fir.ref<i32>) {
+// CHECK-NEXT:    %[[DECL:.*]]:2 = hlfir.declare %[[PRIV_ARG]]
+// CHECK-NEXT:    omp.single nowait {
+// CHECK:           hlfir.assign %{{.*}} to %[[DECL]]#0 : !fir.ref<i32>, !fir.ref<i32>
+// CHECK:           omp.terminator
+// CHECK-NEXT:    }
+// CHECK-NEXT:    omp.barrier
+// CHECK-NEXT:    omp.terminator
+// CHECK-NEXT:  }
+
+
 // Check that reduction clause block arguments are recognized as thread-local.
 
 omp.declare_reduction @add_reduction_i32 : i32 init {
@@ -263,7 +296,9 @@ func.func @non_thread_local_needs_single(%arg0: !fir.ref<i32>) {
 // CHECK-NEXT:  }
 
 
-// Check that loads from thread-local alloca combined with stores are parallelized.
+// Check that stores to thread-local alloca are parallelized, but loads stay in
+// omp.single. Only writes to thread-local memory are safe to parallelize;
+// reads must remain in the single to preserve synchronization barriers.
 
 // CHECK-LABEL: func.func @thread_local_load_and_store
 func.func @thread_local_load_and_store() {
@@ -281,14 +316,90 @@ func.func @thread_local_load_and_store() {
   return
 }
 
-// All operations on thread-local memory should be outside omp.single
+// The store to thread-local memory is parallelized (outside the single),
+// but the load remains inside the single to maintain synchronization.
 
 // CHECK:       omp.parallel {
 // CHECK-NEXT:    %[[ALLOCA:.*]] = fir.alloca i32
-// CHECK-NEXT:    %[[C1:.*]] = arith.constant 1 : i32
-// CHECK-NEXT:    fir.store %[[C1]] to %[[ALLOCA]] : !fir.ref<i32>
-// CHECK-NEXT:    %[[VAL:.*]] = fir.load %[[ALLOCA]] : !fir.ref<i32>
-// CHECK-NEXT:    fir.store %[[VAL]] to %[[ALLOCA]] : !fir.ref<i32>
+// CHECK:         omp.single nowait {
+// CHECK:           fir.store {{.*}} to %[[ALLOCA]] : !fir.ref<i32>
+// CHECK:           fir.load %[[ALLOCA]] : !fir.ref<i32>
+// CHECK:           fir.store {{.*}} to %[[ALLOCA]] : !fir.ref<i32>
+// CHECK:           omp.terminator
+// CHECK-NEXT:    }
+// CHECK:         fir.store {{.*}} to %[[ALLOCA]] : !fir.ref<i32>
 // CHECK-NEXT:    omp.barrier
 // CHECK-NEXT:    omp.terminator
 // CHECK-NEXT:  }
+
+
+// Check the forall-in-workshare pattern: a sequential loop (fir.do_loop) with
+// thread-local index stores, shared memory loads, and a workshare.loop_wrapper.
+// This models the lowered IR for:
+//   !$omp workshare
+//   forall (i=1:n) a(:,i) = a(:,i) + 1
+//   !$omp end workshare
+//
+// Loads from thread-local allocas must remain inside omp.single so that the
+// single's barrier preserves thread synchronization.
+// If reads were also safe-to-parallelize, the entire SingleRegion before the
+// workshare.loop_wrapper could become fully parallelized, eliminating the
+// omp.single and its implicit barrier. This caused race conditions on shared
+// runtime data structures in forall-workshare patterns (see issue #143330).
+
+// CHECK-LABEL: func.func @forall_pattern_in_workshare
+func.func @forall_pattern_in_workshare(%shared: !fir.ref<i32>) {
+  omp.parallel {
+    %idx_alloca = fir.alloca i32 {bindc_name = "i", pinned}
+    omp.workshare {
+      %c1 = arith.constant 1 : index
+      %c10 = arith.constant 10 : index
+      fir.do_loop %iv = %c1 to %c10 step %c1 {
+        // Store loop index to thread-local alloca (must be parallelized)
+        %iv_i32 = fir.convert %iv : (index) -> i32
+        fir.store %iv_i32 to %idx_alloca : !fir.ref<i32>
+        // Load from shared memory (must stay in omp.single)
+        %shared_val = fir.load %shared : !fir.ref<i32>
+        // Load from thread-local alloca (must stay in omp.single to
+        // prevent SingleRegion elimination and barrier loss)
+        %idx_val = fir.load %idx_alloca : !fir.ref<i32>
+        // Side-effecting op using both values (must stay in omp.single)
+        "test.side_effect"(%shared_val, %idx_val) : (i32, i32) -> ()
+        // Workshared loop
+        omp.workshare.loop_wrapper {
+          omp.loop_nest (%j) : index = (%c1) to (%c10) inclusive step (%c1) {
+            "test.inner"(%j) : (index) -> ()
+            omp.yield
+          }
+        }
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// The thread-local load must remain inside omp.single along with the shared
+// load and side-effecting op (preserving the single's barrier). The
+// thread-local store is also cloned inside the single, but a parallel copy
+// is placed after it so all threads update their own alloca.
+// CHECK:       omp.parallel {
+// CHECK:         %[[IDX:.*]] = fir.alloca i32 {bindc_name = "i", pinned}
+// CHECK:         fir.do_loop
+// The single contains the shared load, thread-local load, and side effect.
+// The thread-local store is also cloned inside (harmless, one thread runs it).
+// CHECK:           omp.single {
+// CHECK:             fir.store {{.*}} to %[[IDX]] : !fir.ref<i32>
+// CHECK:             fir.load %{{.*}} : !fir.ref<i32>
+// CHECK:             fir.load %[[IDX]] : !fir.ref<i32>
+// CHECK:             "test.side_effect"
+// CHECK:             omp.terminator
+// CHECK-NEXT:      }
+// The parallelized copy of the store runs after the single's barrier,
+// ensuring all threads update their own thread-local index alloca.
+// CHECK:           fir.store {{.*}} to %[[IDX]] : !fir.ref<i32>
+// CHECK:           omp.wsloop {
+// CHECK:         }
+// CHECK:         omp.barrier
+// CHECK:       }