Coverage for peakipy/io.py: 93%

509 statements  

coverage.py v7.10.6, created at 2025-09-15 20:54 -0400

import sys
import textwrap
from pathlib import Path
from enum import Enum

import numpy as np
import nmrglue as ng
import pandas as pd
from rich import print
from rich.console import Console

from bokeh.palettes import Category20
from scipy import ndimage
from skimage.morphology import binary_closing, disk, footprint_rectangle
from skimage.filters import threshold_otsu
from pydantic import BaseModel

from peakipy.utils import df_to_rich_table
from peakipy.fitting import make_mask

console = Console()


class StrucEl(str, Enum):
    square = "square"
    disk = "disk"
    rectangle = "rectangle"
    mask_method = "mask_method"


class PeaklistFormat(str, Enum):
    a2 = "a2"
    a3 = "a3"
    sparky = "sparky"
    pipe = "pipe"
    peakipy = "peakipy"
    csv = "csv"


class OutFmt(str, Enum):
    csv = "csv"
    pkl = "pkl"


class PeaklistColumns(BaseModel):
    """These are the columns required for performing fits in peakipy"""

    INDEX: int
    X_AXIS: int
    Y_AXIS: int
    X_AXISf: float
    Y_AXISf: float
    X_PPM: float
    Y_PPM: float
    XW: float
    YW: float
    XW_HZ: float
    YW_HZ: float
    HEIGHT: float
    VOL: float
    ASS: str
    X_RADIUS: float
    Y_RADIUS: float
    X_RADIUS_PPM: float
    Y_RADIUS_PPM: float
    include: str


class PeaklistColumnsWithClusters(PeaklistColumns):
    CLUSTID: int
    MEMCNT: int
    color: str

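# A minimal sketch (not part of the original module) of how the pydantic models
# above validate and coerce one peaklist row; the literal values are made up:
#
#     row = dict(
#         INDEX=0, X_AXIS=10, Y_AXIS=20, X_AXISf=10.2, Y_AXISf=20.7,
#         X_PPM=8.15, Y_PPM=120.3, XW=3.1, YW=2.8, XW_HZ="20.0", YW_HZ="20.0",
#         HEIGHT=1.5e6, VOL=3.0e6, ASS="A1N-H", X_RADIUS=4.0, Y_RADIUS=4.0,
#         X_RADIUS_PPM=0.04, Y_RADIUS_PPM=0.4, include="yes",
#     )
#     validated = PeaklistColumns(**row).model_dump()  # XW_HZ/YW_HZ coerced to float
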

class Pseudo3D:
    """Read dic, data from NMRGlue and dims from input to create a Pseudo3D dataset

    :param dic: from nmrglue.pipe.read
    :type dic: dict

    :param data: data from nmrglue.pipe.read
    :type data: numpy.array

    :param dims: dimension order i.e. [0, 1, 2] where 0 = planes, 1 = f1, 2 = f2
    :type dims: list
    """

    def __init__(self, dic, data, dims):
        # check dimensions
        self._udic = ng.pipe.guess_udic(dic, data)
        self._ndim = self._udic["ndim"]

        if self._ndim == 1:
            err = """[red]
            ##########################################
            NMR Data should be either 2D or 3D
            ##########################################
            [/red]"""
            # raise TypeError(err)
            sys.exit(err)

        # check that spectrum has correct number of dims
        elif self._ndim != len(dims):
            err = f"""[red]
            #################################################################
            Your spectrum has {self._ndim} dimensions with shape {data.shape}
            but you have given a dimension order of {dims}...
            #################################################################
            [/red]"""
            # raise ValueError(err)
            sys.exit(err)

        elif (self._ndim == 2) and (len(dims) == 2):
            self._f1_dim, self._f2_dim = dims
            self._planes = 0
            self._uc_f1 = ng.pipe.make_uc(dic, data, dim=self._f1_dim)
            self._uc_f2 = ng.pipe.make_uc(dic, data, dim=self._f2_dim)
            # make data pseudo3d
            self._data = data.reshape((1, data.shape[0], data.shape[1]))
            self._dims = [self._planes, self._f1_dim + 1, self._f2_dim + 1]

        else:
            self._planes, self._f1_dim, self._f2_dim = dims
            self._dims = dims
            self._data = data
            # make unit conversion dicts
            self._uc_f2 = ng.pipe.make_uc(dic, data, dim=self._f2_dim)
            self._uc_f1 = ng.pipe.make_uc(dic, data, dim=self._f1_dim)

        # rearrange data if dims not in standard order
        if self._dims != [0, 1, 2]:
            # np.argsort returns indices of array for order 0,1,2 to transpose data correctly
            # self._dims = np.argsort(self._dims)
            self._data = np.transpose(data, self._dims)

        self._dic = dic

        self._f1_label = self._udic[self._f1_dim]["label"]
        self._f2_label = self._udic[self._f2_dim]["label"]


    @property
    def uc_f1(self):
        """Return unit conversion dict for F1"""
        return self._uc_f1

    @property
    def uc_f2(self):
        """Return unit conversion dict for F2"""
        return self._uc_f2

    @property
    def dims(self):
        """Return dimension order"""
        return self._dims

    @property
    def data(self):
        """Return array containing data"""
        return self._data

    @data.setter
    def data(self, data):
        self._data = data

    @property
    def dic(self):
        return self._dic

    @property
    def udic(self):
        return self._udic

    @property
    def ndim(self):
        return self._ndim

    @property
    def f1_label(self):
        # dim label
        return self._f1_label

    @property
    def f2_label(self):
        # dim label
        return self._f2_label

    @property
    def planes(self):
        return self.dims[0]

    @property
    def n_planes(self):
        return self.data.shape[self.planes]

    @property
    def f1(self):
        return self.dims[1]

    @property
    def f2(self):
        return self.dims[2]

    # size of f1 and f2 in points
    @property
    def f2_size(self):
        """Return size of f2 dimension in points"""
        return self._udic[self._f2_dim]["size"]

    @property
    def f1_size(self):
        """Return size of f1 dimension in points"""
        return self._udic[self._f1_dim]["size"]

    # points per ppm
    @property
    def pt_per_ppm_f1(self):
        return self.f1_size / (
            self._udic[self._f1_dim]["sw"] / self._udic[self._f1_dim]["obs"]
        )

    @property
    def pt_per_ppm_f2(self):
        return self.f2_size / (
            self._udic[self._f2_dim]["sw"] / self._udic[self._f2_dim]["obs"]
        )

    # points per hz
    @property
    def pt_per_hz_f1(self):
        return self.f1_size / self._udic[self._f1_dim]["sw"]

    @property
    def pt_per_hz_f2(self):
        return self.f2_size / self._udic[self._f2_dim]["sw"]

    # hz per point
    @property
    def hz_per_pt_f1(self):
        return 1.0 / self.pt_per_hz_f1

    @property
    def hz_per_pt_f2(self):
        return 1.0 / self.pt_per_hz_f2

    # ppm per point
    @property
    def ppm_per_pt_f1(self):
        return 1.0 / self.pt_per_ppm_f1

    @property
    def ppm_per_pt_f2(self):
        return 1.0 / self.pt_per_ppm_f2

    # get ppm limits for ppm scales
    @property
    def f2_ppm_scale(self):
        return self.uc_f2.ppm_scale()

    @property
    def f1_ppm_scale(self):
        return self.uc_f1.ppm_scale()

    @property
    def f2_ppm_limits(self):
        return self.uc_f2.ppm_limits()

    @property
    def f1_ppm_limits(self):
        return self.uc_f1.ppm_limits()

    @property
    def f1_ppm_max(self):
        return max(self.f1_ppm_limits)

    @property
    def f1_ppm_min(self):
        return min(self.f1_ppm_limits)

    @property
    def f2_ppm_max(self):
        return max(self.f2_ppm_limits)

    @property
    def f2_ppm_min(self):
        return min(self.f2_ppm_limits)

    @property
    def f2_ppm_0(self):
        return self.f2_ppm_limits[0]

    @property
    def f2_ppm_1(self):
        return self.f2_ppm_limits[1]

    @property
    def f1_ppm_0(self):
        return self.f1_ppm_limits[0]

    @property
    def f1_ppm_1(self):
        return self.f1_ppm_limits[1]

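# A minimal usage sketch (not in the original module; "test.ft2" is a
# hypothetical NMRPipe pseudo-3D file):
#
#     import nmrglue as ng
#     dic, data = ng.pipe.read("test.ft2")
#     spec = Pseudo3D(dic, data, dims=[0, 1, 2])
#     print(spec.n_planes, spec.f1_label, spec.f2_label)
#     print(spec.f2_ppm_limits)  # ppm limits from the nmrglue unit converter
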

class UnknownFormat(Exception):
    pass



class Peaklist(Pseudo3D):
    """Read an Analysis, Sparky or NMRPipe peak list, convert it to an
    NMRPipe-ish format and find peak clusters

    Parameters
    ----------
    path : path-like or str
        path to peaklist
    data_path : path-like or str
        path to NMRPipe format data
    fmt : str
        a2|a3|sparky|pipe
    dims : list
        [planes, y, x]
    radii : list
        [x, y] mask radii in ppm


    Methods
    -------

    clusters :
    mask_method :
    adaptive_clusters :

    Returns
    -------
    df : pandas DataFrame
        dataframe containing peaklist

    """


    def __init__(
        self,
        path,
        data_path,
        fmt: PeaklistFormat = PeaklistFormat.a2,
        dims=[0, 1, 2],
        radii=[0.04, 0.4],
        posF1="Position F2",
        posF2="Position F1",
        verbose=False,
    ):
        dic, data = ng.pipe.read(data_path)
        Pseudo3D.__init__(self, dic, data, dims)
        self.fmt = fmt
        self.peaklist_path = path
        self.data_path = data_path
        self.verbose = verbose
        self._radii = radii
        self._thres = None
        if self.verbose:
            print(
                "Points per hz f1 = %.3f, f2 = %.3f"
                % (self.pt_per_hz_f1, self.pt_per_hz_f2)
            )

        self._analysis_to_pipe_dic = {
            "#": "INDEX",
            "Position F1": "X_PPM",
            "Position F2": "Y_PPM",
            "Line Width F1 (Hz)": "XW_HZ",
            "Line Width F2 (Hz)": "YW_HZ",
            "Height": "HEIGHT",
            "Volume": "VOL",
        }
        self._assign_to_pipe_dic = {
            "#": "INDEX",
            "Pos F1": "X_PPM",
            "Pos F2": "Y_PPM",
            "LW F1 (Hz)": "XW_HZ",
            "LW F2 (Hz)": "YW_HZ",
            "Height": "HEIGHT",
            "Volume": "VOL",
        }

        self._sparky_to_pipe_dic = {
            "index": "INDEX",
            "w1": "X_PPM",
            "w2": "Y_PPM",
            "lw1 (hz)": "XW_HZ",
            "lw2 (hz)": "YW_HZ",
            "Height": "HEIGHT",
            "Volume": "VOL",
            "Assignment": "ASS",
        }

        self._analysis_to_pipe_dic[posF1] = "Y_PPM"
        self._analysis_to_pipe_dic[posF2] = "X_PPM"

        self._df = self.read_peaklist()


    def read_peaklist(self):
        match self.fmt:
            case PeaklistFormat.a2:
                self._df = self._read_analysis()

            case PeaklistFormat.a3:
                self._df = self._read_assign()

            case PeaklistFormat.sparky:
                self._df = self._read_sparky()

            case PeaklistFormat.pipe:
                self._df = self._read_pipe()

            case PeaklistFormat.csv:
                self._df = self._read_csv()

            case _:
                raise UnknownFormat(f"I don't know this format: {self.fmt}")

        return self._df


    @property
    def df(self):
        return self._df

    @df.setter
    def df(self, df):
        self._df = df

    @property
    def radii(self):
        return self._radii

    def check_radius_contains_enough_points_for_fitting(self, radius, pt_per_ppm, flag):
        if (radius * pt_per_ppm) < 2.0:
            new_radius = 2.0 * (1.0 / pt_per_ppm)
            print(
                "\n",
                f"[red]Warning: {flag} is set to {radius:.3f} ppm which is {radius * pt_per_ppm:.3f} points[/red]" + "\n",
                f"[yellow]Setting to 2 points which is {new_radius:.3f} ppm[/yellow]" + "\n",
                "[yellow]Consider increasing this value to improve robustness of fitting (or increase zero filling)[/yellow]" + "\n",
            )
        else:
            new_radius = radius
        return new_radius

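    # A worked example of the check above (hypothetical numbers, not from the
    # original source): with f2_size = 2048 points, sw = 10000 Hz and
    # obs = 600 MHz, pt_per_ppm_f2 = 2048 / (10000 / 600) ≈ 122.9 points/ppm,
    # so a 0.01 ppm radius covers only ~1.2 points and would be widened to
    # 2 * (1 / 122.9) ≈ 0.016 ppm (2 points).
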

    @property
    def f2_radius(self):
        """radius for fitting mask in f2"""
        _f2_radius = self.check_radius_contains_enough_points_for_fitting(
            self.radii[0], self.pt_per_ppm_f2, "--x-radius-ppm"
        )
        return _f2_radius

    @property
    def f1_radius(self):
        """radius for fitting mask in f1"""
        _f1_radius = self.check_radius_contains_enough_points_for_fitting(
            self.radii[1], self.pt_per_ppm_f1, "--y-radius-ppm"
        )
        return _f1_radius

    @property
    def analysis_to_pipe_dic(self):
        return self._analysis_to_pipe_dic

    @property
    def assign_to_pipe_dic(self):
        return self._assign_to_pipe_dic

    @property
    def sparky_to_pipe_dic(self):
        return self._sparky_to_pipe_dic

    @property
    def thres(self):
        if self._thres is None:
            self._thres = abs(threshold_otsu(self.data[0]))
        return self._thres


    def validate_peaklist(self):
        self.df = pd.DataFrame(
            [
                PeaklistColumns(**i).model_dump()
                for i in self.df.to_dict(orient="records")
            ]
        )
        return self.df

    def update_df(self):
        # int point value
        self.df["X_AXIS"] = self.df.X_PPM.apply(lambda x: self.uc_f2(x, "ppm"))
        self.df["Y_AXIS"] = self.df.Y_PPM.apply(lambda x: self.uc_f1(x, "ppm"))
        # decimal point value
        self.df["X_AXISf"] = self.df.X_PPM.apply(lambda x: self.uc_f2.f(x, "ppm"))
        self.df["Y_AXISf"] = self.df.Y_PPM.apply(lambda x: self.uc_f1.f(x, "ppm"))
        # in case of missing values (should estimate though)
        self.df["XW_HZ"] = self.df.XW_HZ.replace("None", "20.0")
        self.df["YW_HZ"] = self.df.YW_HZ.replace("None", "20.0")
        self.df["XW_HZ"] = self.df.XW_HZ.replace(np.nan, "20.0")
        self.df["YW_HZ"] = self.df.YW_HZ.replace(np.nan, "20.0")
        # convert linewidths to float
        self.df["XW_HZ"] = self.df.XW_HZ.apply(lambda x: float(x))
        self.df["YW_HZ"] = self.df.YW_HZ.apply(lambda x: float(x))
        # convert Hz linewidths to points
        self.df["XW"] = self.df.XW_HZ.apply(lambda x: x * self.pt_per_hz_f2)
        self.df["YW"] = self.df.YW_HZ.apply(lambda x: x * self.pt_per_hz_f1)
        # makes an assignment column from Assign F1 and Assign F2 columns
        # in analysis2.x and ccpnmr v3 assign peak lists
        if self.fmt in [PeaklistFormat.a2, PeaklistFormat.a3]:
            self.df["ASS"] = self.df.apply(
                # lambda i: "".join([i["Assign F1"], i["Assign F2"]]), axis=1
                lambda i: f"{i['Assign F1']}_{i['Assign F2']}",
                axis=1,
            )

        # make default values for X and Y radii for fit masks
        self.df["X_RADIUS_PPM"] = np.zeros(len(self.df)) + self.f2_radius
        self.df["Y_RADIUS_PPM"] = np.zeros(len(self.df)) + self.f1_radius
        self.df["X_RADIUS"] = self.df.X_RADIUS_PPM.apply(
            lambda x: x * self.pt_per_ppm_f2
        )
        self.df["Y_RADIUS"] = self.df.Y_RADIUS_PPM.apply(
            lambda x: x * self.pt_per_ppm_f1
        )
        # add include column
        if "include" not in self.df.columns:
            self.df["include"] = self.df.apply(lambda x: "yes", axis=1)

        # check assignments for duplicates
        self.check_assignments()
        # check that peaks are within the bounds of the data
        self.check_peak_bounds()
        self.validate_peaklist()


    def add_fix_bound_columns(self):
        """add columns containing parameter bounds (param_upper/param_lower)
        and whether or not a parameter should be fixed (yes/no)

        For parameter bounding:

            Column names are <param_name>_upper and <param_name>_lower for upper and
            lower bounds respectively. Values are given as floating point. A value of
            0.0 indicates that the parameter is unbounded.
            X/Y positions are given in ppm.
            Linewidths are given in Hz.

        For parameter fixing:

            Column names are <param_name>_fix.
            Values are given as a string 'yes' or 'no'.

        """
        pass

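    # A sketch (hypothetical; this method is currently a stub, and "center_x"
    # is an illustrative parameter name, not confirmed by this module) of the
    # column convention the docstring describes:
    #
    #     self.df["center_x_lower"] = self.df.X_PPM - self.df.X_RADIUS_PPM
    #     self.df["center_x_upper"] = self.df.X_PPM + self.df.X_RADIUS_PPM
    #     self.df["center_x_fix"] = "no"  # 'yes' would hold the parameter fixed
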

    def _read_analysis(self):
        df = pd.read_csv(self.peaklist_path, delimiter="\t")
        new_columns = [self.analysis_to_pipe_dic.get(i, i) for i in df.columns]
        pipe_columns = dict(zip(df.columns, new_columns))
        df = df.rename(index=str, columns=pipe_columns)

        return df

    def _read_assign(self):
        df = pd.read_csv(self.peaklist_path, delimiter="\t")
        new_columns = [self.assign_to_pipe_dic.get(i, i) for i in df.columns]
        pipe_columns = dict(zip(df.columns, new_columns))
        df = df.rename(index=str, columns=pipe_columns)

        return df

    def _read_sparky(self):
        df = pd.read_csv(
            self.peaklist_path,
            skiprows=1,
            sep=r"\s+",
            names=["ASS", "Y_PPM", "X_PPM"],
            # use only first three columns
            usecols=[i for i in range(3)],
        )
        df["INDEX"] = df.index
        # need to add LW estimate
        df["XW_HZ"] = 20.0
        df["YW_HZ"] = 20.0
        # dummy values
        df["HEIGHT"] = 0.0
        df["VOL"] = 0.0
        return df


    def _read_pipe(self):
        to_skip = 0
        with open(self.peaklist_path) as f:
            lines = f.readlines()
            for line in lines:
                if line.startswith("VARS"):
                    # the VARS line of an NMRPipe peak list names the columns
                    columns = line.strip().split()[1:]
                elif line[:5].strip(" ").isdigit():
                    # first data row reached; stop counting header lines
                    break
                else:
                    to_skip += 1
        df = pd.read_csv(
            self.peaklist_path, skiprows=to_skip, names=columns, sep=r"\s+"
        )
        return df

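    # For reference (an illustrative fragment, not taken from this repo), the
    # kind of NMRPipe peak list header _read_pipe expects looks roughly like:
    #
    #     VARS   INDEX X_AXIS Y_AXIS X_PPM Y_PPM XW_HZ YW_HZ HEIGHT VOL ASS
    #     FORMAT %5d %9.3f %9.3f %8.3f %8.3f %8.3f %8.3f %+e %+e %s
    #         1   401.1   159.0    8.150  120.300  21.0  19.5 +1.0e6 +2.5e6 A1N-H
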

    def _read_csv(self):
        """Read a csv file containing peaklist data

        Requires the following columns:
            X_PPM: ppm position of peak in X axis
            Y_PPM: ppm position of peak in Y axis
            ASS: assignment of peak
        Optional columns include:
            XW_HZ: estimated X axis linewidth in Hz
            YW_HZ: estimated Y axis linewidth in Hz
            VOL: peak volume
            HEIGHT: peak height
        """
        df = pd.read_csv(self.peaklist_path)
        df["INDEX"] = df.index
        # need to add LW estimate
        if "XW_HZ" not in df.columns:
            df["XW_HZ"] = 20.0
        if "YW_HZ" not in df.columns:
            df["YW_HZ"] = 20.0
        # dummy values
        if "HEIGHT" not in df.columns:
            df["HEIGHT"] = 0.0
        if "VOL" not in df.columns:
            df["VOL"] = 0.0
        return df

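    # A minimal CSV accepted by _read_csv (illustrative values):
    #
    #     X_PPM,Y_PPM,ASS
    #     8.150,120.300,A1N-H
    #     7.894,115.270,G2N-H
    #
    # Missing XW_HZ/YW_HZ default to 20.0 Hz and HEIGHT/VOL to 0.0 as above.
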

    def check_assignments(self):
        self.df["ASS"] = self.df.ASS.astype(object)
        self.df.loc[self.df["ASS"].isnull(), "ASS"] = "None_dummy_0"
        self.df["ASS"] = self.df.ASS.astype(str)
        duplicates_bool = self.df.ASS.duplicated()
        duplicates = self.df.ASS[duplicates_bool]
        if len(duplicates) > 0:
            console.print(
                textwrap.dedent(
                    """
                    #############################################################################
                    You have duplicated assignments in your list...
                    Currently each peak needs a unique assignment. Sorry about that buddy...
                    #############################################################################
                    """
                ),
                style="yellow",
            )
            self.df.loc[duplicates_bool, "ASS"] = [
                f"{i}_dummy_{num+1}" for num, i in enumerate(duplicates)
            ]
            if self.verbose:
                print("Here are the duplicates")
                print(duplicates)
                print(self.df.ASS)

            print(
                textwrap.dedent(
                    """
                    Creating dummy assignments for duplicates

                    """
                )
            )


    def check_peak_bounds(self):
        columns_to_print = ["INDEX", "ASS", "X_AXIS", "Y_AXIS", "X_PPM", "Y_PPM"]
        # check that peaks are within the bounds of spectrum
        within_x = (self.df.X_PPM < self.f2_ppm_max) & (self.df.X_PPM > self.f2_ppm_min)
        within_y = (self.df.Y_PPM < self.f1_ppm_max) & (self.df.Y_PPM > self.f1_ppm_min)
        self.excluded = self.df[~(within_x & within_y)]
        self.df = self.df[within_x & within_y]
        if len(self.excluded) > 0:
            print(
                textwrap.dedent(
                    f"""[red]
                    #################################################################################

                    Excluding the following peaks as they are not within the spectrum which has shape

                    {self.data.shape}
                    [/red]"""
                )
            )
            table_to_print = df_to_rich_table(
                self.excluded,
                title="Excluded",
                columns=columns_to_print,
                styles=["red" for i in columns_to_print],
            )
            print(table_to_print)
            print(
                "[red]#################################################################################[/red]"
            )


    def clusters(
        self,
        thres=None,
        struc_el: StrucEl = StrucEl.disk,
        struc_size=(3,),
        l_struc=None,
    ):
        """Find clusters of peaks

        :param thres: threshold for positive signals above which clusters are selected.
            If None then threshold_otsu is used
        :type thres: float

        :param struc_el: 'square'|'disk'|'rectangle'
            structuring element for binary_closing of thresholded data; can be
            square, disk or rectangle
        :type struc_el: str

        :param struc_size: size/dimensions of structuring element
            for square and disk the first element of the tuple is used (for disk the
            value corresponds to the radius); for rectangle the tuple corresponds
            to (width, height)
        :type struc_size: tuple

        """
        peaks = [[y, x] for y, x in zip(self.df.Y_AXIS, self.df.X_AXIS)]

        if thres is None:
            self._thres = abs(threshold_otsu(self.data[0]))
        else:
            self._thres = thres

        # get positive and negative
        thresh_data = np.bitwise_or(
            self.data[0] < (self._thres * -1.0), self.data[0] > self._thres
        )

        match struc_el:
            case StrucEl.disk:
                radius = struc_size[0]
                if self.verbose:
                    print(f"using disk with {radius}")
                closed_data = binary_closing(thresh_data, disk(int(radius)))

            case StrucEl.square:
                width = struc_size[0]
                if self.verbose:
                    print(f"using square with {width}")
                closed_data = binary_closing(
                    thresh_data, footprint_rectangle((int(width), int(width)))
                )

            case StrucEl.rectangle:
                width, height = struc_size
                if self.verbose:
                    print(f"using rectangle with {width} and {height}")
                closed_data = binary_closing(
                    thresh_data, footprint_rectangle((int(width), int(height)))
                )

            case _:
                if self.verbose:
                    print("Not using any closing function")
                closed_data = thresh_data

        labeled_array, num_features = ndimage.label(closed_data, l_struc)

        self.df.loc[:, "CLUSTID"] = [labeled_array[i[0], i[1]] for i in peaks]

        # renumber "0" clusters
        max_clustid = self.df["CLUSTID"].max()
        n_of_zeros = len(self.df[self.df["CLUSTID"] == 0]["CLUSTID"])
        self.df.loc[self.df[self.df["CLUSTID"] == 0].index, "CLUSTID"] = np.arange(
            max_clustid + 1, n_of_zeros + max_clustid + 1, dtype=int
        )

        # count how many peaks per cluster
        for ind, group in self.df.groupby("CLUSTID"):
            self.df.loc[group.index, "MEMCNT"] = len(group)

        self.df.loc[:, "color"] = self.df.apply(
            lambda x: Category20[20][int(x.CLUSTID) % 20] if x.MEMCNT > 1 else "black",
            axis=1,
        )
        return ClustersResult(labeled_array, num_features, closed_data, peaks)

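    # A usage sketch (hypothetical; assumes a Peaklist instance called
    # `peaklist` whose update_df() has populated X_AXIS/Y_AXIS):
    #
    #     result = peaklist.clusters(struc_el=StrucEl.disk, struc_size=(3,))
    #     print(result.num_features)  # number of connected regions found
    #     print(peaklist.df[["ASS", "CLUSTID", "MEMCNT"]].head())
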

    def mask_method(self, overlap=1.0, l_struc=None):
        """connect clusters based on overlap of fitting masks

        :param overlap: fraction of mask for which overlaps are calculated
        :type overlap: float

        :returns ClustersResult: Instance of ClustersResult
        :rtype: ClustersResult
        """
        # overlap is positive
        overlap = abs(overlap)

        self._thres = threshold_otsu(self.data[0])

        mask = np.zeros(self.data[0].shape, dtype=bool)

        for ind, peak in self.df.iterrows():
            mask += make_mask(
                self.data[0],
                peak.X_AXISf,
                peak.Y_AXISf,
                peak.X_RADIUS * overlap,
                peak.Y_RADIUS * overlap,
            )

        peaks = [[y, x] for y, x in zip(self.df.Y_AXIS, self.df.X_AXIS)]
        labeled_array, num_features = ndimage.label(mask, l_struc)

        self.df.loc[:, "CLUSTID"] = [labeled_array[i[0], i[1]] for i in peaks]

        # renumber "0" clusters
        max_clustid = self.df["CLUSTID"].max()
        n_of_zeros = len(self.df[self.df["CLUSTID"] == 0]["CLUSTID"])
        self.df.loc[self.df[self.df["CLUSTID"] == 0].index, "CLUSTID"] = np.arange(
            max_clustid + 1, n_of_zeros + max_clustid + 1, dtype=int
        )

        # count how many peaks per cluster
        for ind, group in self.df.groupby("CLUSTID"):
            self.df.loc[group.index, "MEMCNT"] = len(group)

        self.df.loc[:, "color"] = self.df.apply(
            lambda x: Category20[20][int(x.CLUSTID) % 20] if x.MEMCNT > 1 else "black",
            axis=1,
        )

        return ClustersResult(labeled_array, num_features, mask, peaks)


    def to_fuda(self):
        fname = self.peaklist_path.parent / "params.fuda"
        with open(self.peaklist_path.parent / "peaks.fuda", "w") as peaks_fuda:
            for ass, f1_ppm, f2_ppm in zip(self.df.ASS, self.df.Y_PPM, self.df.X_PPM):
                peaks_fuda.write(f"{ass}\t{f1_ppm:.3f}\t{f2_ppm:.3f}\n")
        groups = self.df.groupby("CLUSTID")
        fuda_params = Path(fname)
        overlap_peaks = ""

        for ind, group in groups:
            if len(group) > 1:
                overlap_peaks_str = ";".join(group.ASS)
                overlap_peaks += f"OVERLAP_PEAKS=({overlap_peaks_str})\n"

        fuda_file = textwrap.dedent(
            f"""\

# Read peaklist and spectrum info
PEAKLIST=peaks.fuda
SPECFILE={self.data_path}
PARAMETERFILE=(bruker;vclist)
ZCORR=ncyc
NOISE={self.thres} # you'll need to adjust this
BASELINE=N
VERBOSELEVEL=5
PRINTDATA=Y
LM=(MAXFEV=250;TOL=1e-5)
#Specify the default values. All values are in ppm:
DEF_LINEWIDTH_F1={self.f1_radius}
DEF_LINEWIDTH_F2={self.f2_radius}
DEF_RADIUS_F1={self.f1_radius}
DEF_RADIUS_F2={self.f2_radius}
SHAPE=GLORE
# OVERLAP PEAKS
{overlap_peaks}"""
        )
        with open(fuda_params, "w") as f:
            print(f"Writing FuDA parameter file {fuda_params}")
            f.write(fuda_file)
        if self.verbose:
            print(overlap_peaks)


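# A sketch of the overall Peaklist workflow (hypothetical file names):
#
#     peaklist = Peaklist("peaks.csv", "test.ft2", fmt=PeaklistFormat.csv,
#                         dims=[0, 1, 2], radii=[0.04, 0.4])
#     peaklist.update_df()                # derive point positions, radii, etc.
#     peaklist.clusters(struc_size=(3,))  # assign CLUSTID/MEMCNT/color
#     peaklist.to_fuda()                  # optionally export FuDA input files
#     df = peaklist.df
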

class ClustersResult:
    """Class to store results of clusters function"""

    def __init__(self, labeled_array, num_features, closed_data, peaks):
        self._labeled_array = labeled_array
        self._num_features = num_features
        self._closed_data = closed_data
        self._peaks = peaks

    @property
    def labeled_array(self):
        return self._labeled_array

    @property
    def num_features(self):
        return self._num_features

    @property
    def closed_data(self):
        return self._closed_data

    @property
    def peaks(self):
        return self._peaks



class LoadData(Peaklist):
    """Load peaklist data from a peakipy .csv file output by either peakipy read or edit

    read_peaklist is redefined to just read a .csv file

    check_data_frame makes sure the data frame is in good shape for setting up fits

    """

    def read_peaklist(self):
        if self.peaklist_path.suffix == ".csv":
            self.df = pd.read_csv(self.peaklist_path)  # , comment="#")

        elif self.peaklist_path.suffix == ".tab":
            self.df = pd.read_csv(self.peaklist_path, sep="\t")  # comment="#")

        else:
            self.df = pd.read_pickle(self.peaklist_path)

        self._thres = threshold_otsu(self.data[0])

        return self.df


    def validate_peaklist(self):
        self.df = pd.DataFrame(
            [
                PeaklistColumnsWithClusters(**i).model_dump()
                for i in self.df.to_dict(orient="records")
            ]
        )
        return self.df


    def check_data_frame(self):
        """
        Ensure the data frame has all required columns and add necessary derived columns for fitting.

        Returns
        -------
        pd.DataFrame
            The modified DataFrame after validation.
        """
        # make diameter columns
        if "X_DIAMETER_PPM" not in self.df.columns:
            self.df["X_DIAMETER_PPM"] = self.df["X_RADIUS_PPM"] * 2.0
            self.df["Y_DIAMETER_PPM"] = self.df["Y_RADIUS_PPM"] * 2.0

        # make a column to track edited peaks
        if "Edited" not in self.df.columns:
            self.df["Edited"] = np.zeros(len(self.df), dtype=bool)

        # create include column if it doesn't exist
        if "include" not in self.df.columns:
            self.df["include"] = self.df.apply(lambda _: "yes", axis=1)

        # color clusters
        self.df["color"] = self.df.apply(
            lambda x: Category20[20][int(x.CLUSTID) % 20] if x.MEMCNT > 1 else "black",
            axis=1,
        )

        # get rid of unnamed columns
        unnamed_cols = [i for i in self.df.columns if "Unnamed:" in i]
        self.df = self.df.drop(columns=unnamed_cols)
        return self.df


    def update_df(self):
        """Slightly modified to retain previous configurations"""
        # int point value
        self.df["X_AXIS"] = self.df.X_PPM.apply(lambda x: self.uc_f2(x, "ppm"))
        self.df["Y_AXIS"] = self.df.Y_PPM.apply(lambda x: self.uc_f1(x, "ppm"))
        # decimal point value
        self.df["X_AXISf"] = self.df.X_PPM.apply(lambda x: self.uc_f2.f(x, "ppm"))
        self.df["Y_AXISf"] = self.df.Y_PPM.apply(lambda x: self.uc_f1.f(x, "ppm"))
        # in case of missing values (should estimate though)
        self.df["XW_HZ"] = self.df.XW_HZ.replace(np.nan, "20.0")
        self.df["YW_HZ"] = self.df.YW_HZ.replace(np.nan, "20.0")
        # convert linewidths to float
        self.df["XW_HZ"] = self.df.XW_HZ.apply(lambda x: float(x))
        self.df["YW_HZ"] = self.df.YW_HZ.apply(lambda x: float(x))
        # convert Hz linewidths to points
        self.df["XW"] = self.df.XW_HZ.apply(lambda x: x * self.pt_per_hz_f2)
        self.df["YW"] = self.df.YW_HZ.apply(lambda x: x * self.pt_per_hz_f1)
        # makes an assignment column
        if self.fmt == "a2":
            self.df["ASS"] = self.df.apply(
                lambda i: "".join([i["Assign F1"], i["Assign F2"]]), axis=1
            )

        # unlike the parent class, previous X/Y radii (in ppm) are retained
        # rather than reset to defaults
        self.df["X_RADIUS"] = self.df.X_RADIUS_PPM.apply(
            lambda x: x * self.pt_per_ppm_f2
        )
        self.df["Y_RADIUS"] = self.df.Y_RADIUS_PPM.apply(
            lambda x: x * self.pt_per_ppm_f1
        )
        # add include column
        if "include" not in self.df.columns:
            self.df["include"] = self.df.apply(lambda x: "yes", axis=1)

        # check assignments for duplicates
        self.check_assignments()
        # check that peaks are within the bounds of the data
        self.check_peak_bounds()
        self.validate_peaklist()



1015 # read vclist 

1016 if vclist is None: 

1017 vclist = False 

1018 elif vclist.exists(): 

1019 vclist_data = np.genfromtxt(vclist) 

1020 args["vclist_data"] = vclist_data 

1021 vclist = True 

1022 else: 

1023 raise Exception("vclist not found...") 

1024 

1025 args["vclist"] = vclist 

1026 return args
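

# A usage sketch for get_vclist (hypothetical; vclist is expected to be a
# pathlib.Path or None):
#
#     from pathlib import Path
#     args = {}
#     args = get_vclist(Path("vclist"), args)  # sets args["vclist"] and args["vclist_data"]
#     args = get_vclist(None, args)            # sets args["vclist"] = False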