Sketchy NVMe storage died in no time under a Linux software mirror

An NVMe SSD I bought on Amazon for 4,280 yen died in under two weeks.

For reference, the one I bought: a Fanxiang S501Q 512GB.

As described in another article, this mini PC is set up to mirror an NVMe SSD against a SATA SSD, so nothing user-visible has broken yet. Still, isn't this way too soon??
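(The other article has the full details, but roughly speaking the array was built along these lines; a sketch, with device names matching what shows up below and the rest assumed:)

# RAID1 across an NVMe partition and a SATA partition, used as an LVM PV
# (the array Name "pv00" matches the md superblock shown later)
mdadm --create /dev/md/pv00 --level=1 --raid-devices=2 /dev/nvme0n1p3 /dev/sdc3
pvcreate /dev/md/pv00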

First, check the array status with cat /proc/mdstat:

# cat /proc/mdstat
Personalities : [raid1]
md127 : active raid1 sdc3[1] nvme0n1p3[0](F)
      497876992 blocks super 1.2 [2/1] [_U]
      bitmap: 2/4 pages [8KB], 65536KB chunk

unused devices: <none>
#

The (F) flag shows nvme0n1p3 marked as failed, and [_U] means only the second member (sdc3) is still active. Next, the details via mdadm:

# mdadm --query /dev/md127
/dev/md127: 474.81GiB raid1 2 devices, 0 spares. Use mdadm --detail for more detail.
# mdadm --detail /dev/md127
/dev/md127:
           Version : 1.2
     Creation Time : Mon Nov 25 22:23:15 2024
        Raid Level : raid1
        Array Size : 497876992 (474.81 GiB 509.83 GB)
     Used Dev Size : 497876992 (474.81 GiB 509.83 GB)
      Raid Devices : 2
     Total Devices : 2
       Persistence : Superblock is persistent

     Intent Bitmap : Internal

       Update Time : Fri Dec  6 11:27:27 2024
             State : clean, degraded
    Active Devices : 1
   Working Devices : 1
    Failed Devices : 1
     Spare Devices : 0

Consistency Policy : bitmap

              Name : niselog.dyndns.ws:pv00  (local to host niselog.dyndns.ws)
              UUID : 44d77e34:c9af4167:1c6031a7:b047cdb0
            Events : 56525

    Number   Major   Minor   RaidDevice State
       -       0        0        0      removed
       1       8       35        1      active sync   /dev/sdc3

       0     259        3        -      faulty   /dev/nvme0n1p3
#
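As an aside, md could have reported this failure on its own: the mdmonitor service mails alerts on Fail events if /etc/mdadm.conf names a recipient. A minimal sketch (assuming a working local MTA on the box):

# /etc/mdadm.conf -- one line is enough for mdmonitor to mail on Fail/DegradedArray events
MAILADDR root

# then enable the monitor (EL9 ships it as mdmonitor.service)
systemctl enable --now mdmonitor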

Next, get the state of each member device with mdadm --examine:

# mdadm --examine /dev/sdc3
/dev/sdc3:
          Magic : a92b4efc
        Version : 1.2
    Feature Map : 0x1
     Array UUID : 44d77e34:c9af4167:1c6031a7:b047cdb0
           Name : niselog.dyndns.ws:pv00  (local to host niselog.dyndns.ws)
  Creation Time : Mon Nov 25 22:23:15 2024
     Raid Level : raid1
   Raid Devices : 2

 Avail Dev Size : 995753984 sectors (474.81 GiB 509.83 GB)
     Array Size : 497876992 KiB (474.81 GiB 509.83 GB)
    Data Offset : 264192 sectors
   Super Offset : 8 sectors
   Unused Space : before=264112 sectors, after=0 sectors
          State : clean
    Device UUID : 622cd160:74e95f66:6266ee0d:85ba3287

Internal Bitmap : 8 sectors from superblock
    Update Time : Fri Dec  6 11:29:02 2024
  Bad Block Log : 512 entries available at offset 16 sectors
       Checksum : 247ea644 - correct
         Events : 56583


   Device Role : Active device 1
   Array State : .A ('A' == active, '.' == missing, 'R' == replacing)
# mdadm --examine /dev/nvme0n1p3
mdadm: No md superblock detected on /dev/nvme0n1p3.
#

The NVMe-side device can't even be read: no md superblock is visible on it anymore.

The relevant dmesg output:

[251879.751800] systemd-rc-local-generator[882428]: /etc/rc.d/rc.local is not marked executable, skipping.
[345055.452619] nvme nvme0: I/O tag 322 (0142) opcode 0x0 (Flush) QID 4 timeout, aborting req_op:FLUSH(2) size:0
[345057.437597] nvme nvme0: I/O tag 210 (a0d2) opcode 0x2 (Read) QID 2 timeout, aborting req_op:READ(0) size:32768
[345057.437643] nvme nvme0: I/O tag 706 (c2c2) opcode 0x2 (Read) QID 3 timeout, aborting req_op:READ(0) size:32768
[345085.664306] nvme nvme0: I/O tag 322 (0142) opcode 0x0 (Flush) QID 4 timeout, reset controller
[345167.062438] INFO: task md127_raid1:603 blocked for more than 122 seconds.
[345167.062449]       Tainted: G               X  -------  ---  5.14.0-503.14.1.el9_5.x86_64 #1
[345167.062452] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[345167.062454] task:md127_raid1     state:D stack:0     pid:603   tgid:603   ppid:2      flags:0x00004000
[345167.062460] Call Trace:
[345167.062462]  <TASK>
[345167.062466]  __schedule+0x229/0x550
[345167.062473]  ? __schedule+0x231/0x550
[345167.062477]  schedule+0x2e/0xd0
[345167.062480]  md_super_wait+0x72/0xa0
[345167.062484]  ? __pfx_autoremove_wake_function+0x10/0x10
[345167.062489]  write_sb_page+0x8a/0x110
[345167.062492]  md_update_sb.part.0+0x2eb/0x800
[345167.062494]  md_check_recovery+0x232/0x390
[345167.062500]  raid1d+0x40/0x580 [raid1]
[345167.062508]  ? __timer_delete_sync+0x2c/0x40
[345167.062511]  ? schedule_timeout+0x92/0x160
[345167.062514]  ? prepare_to_wait_event+0x5d/0x180
[345167.062517]  md_thread+0xa8/0x160
[345167.062520]  ? __pfx_autoremove_wake_function+0x10/0x10
[345167.062523]  ? __pfx_md_thread+0x10/0x10
[345167.062525]  kthread+0xdd/0x100
[345167.062529]  ? __pfx_kthread+0x10/0x10
[345167.062532]  ret_from_fork+0x29/0x50
[345167.062536]  </TASK>
[345167.062539] INFO: task xfsaild/dm-0:715 blocked for more than 122 seconds.
[345167.062542]       Tainted: G               X  -------  ---  5.14.0-503.14.1.el9_5.x86_64 #1
[345167.062544] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[345167.062546] task:xfsaild/dm-0    state:D stack:0     pid:715   tgid:715   ppid:2      flags:0x00004000
[345167.062550] Call Trace:
[345167.062552]  <TASK>
[345167.062553]  __schedule+0x229/0x550
[345167.062556]  ? bio_associate_blkg_from_css+0xf5/0x320
[345167.062561]  schedule+0x2e/0xd0
[345167.062564]  md_write_start.part.0+0x195/0x250
[345167.062566]  ? __pfx_autoremove_wake_function+0x10/0x10
[345167.062570]  raid1_make_request+0x5b/0xbb [raid1]
[345167.062575]  md_handle_request+0x150/0x270
[345167.062578]  ? __bio_split_to_limits+0x8e/0x280
[345167.062582]  __submit_bio+0x94/0x130
[345167.062584]  __submit_bio_noacct+0x7e/0x1e0
[345167.062587]  xfs_buf_ioapply_map+0x1cb/0x270 [xfs]
[345167.062725]  _xfs_buf_ioapply+0xcf/0x1b0 [xfs]
[345167.062821]  ? __pfx_default_wake_function+0x10/0x10
[345167.062824]  __xfs_buf_submit+0x6e/0x1e0 [xfs]
[345167.062916]  xfs_buf_delwri_submit_buffers+0xe3/0x230 [xfs]
[345167.063005]  xfsaild_push+0x1aa/0x740 [xfs]
[345167.063122]  xfsaild+0xb2/0x150 [xfs]
[345167.063230]  ? __pfx_xfsaild+0x10/0x10 [xfs]
[345167.063333]  kthread+0xdd/0x100
[345167.063336]  ? __pfx_kthread+0x10/0x10
[345167.063339]  ret_from_fork+0x29/0x50
[345167.063342]  </TASK>
[345167.063353] INFO: task xfsaild/dm-12:1051 blocked for more than 122 seconds.
[345167.063356]       Tainted: G               X  -------  ---  5.14.0-503.14.1.el9_5.x86_64 #1
[345167.063358] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[345167.063360] task:xfsaild/dm-12   state:D stack:0     pid:1051  tgid:1051  ppid:2      flags:0x00004000
[345167.063364] Call Trace:
[345167.063365]  <TASK>
[345167.063366]  __schedule+0x229/0x550
[345167.063369]  ? bio_associate_blkg_from_css+0xf5/0x320
[345167.063373]  schedule+0x2e/0xd0
[345167.063376]  md_write_start.part.0+0x195/0x250
[345167.063378]  ? __pfx_autoremove_wake_function+0x10/0x10
[345167.063382]  raid1_make_request+0x5b/0xbb [raid1]
[345167.063387]  md_handle_request+0x150/0x270
[345167.063390]  ? __bio_split_to_limits+0x8e/0x280
[345167.063393]  __submit_bio+0x94/0x130
[345167.063395]  __submit_bio_noacct+0x7e/0x1e0
[345167.063397]  xfs_buf_ioapply_map+0x1cb/0x270 [xfs]
[345167.063503]  _xfs_buf_ioapply+0xcf/0x1b0 [xfs]
[345167.063598]  ? __pfx_default_wake_function+0x10/0x10
[345167.063602]  __xfs_buf_submit+0x6e/0x1e0 [xfs]
[345167.063693]  xfs_buf_delwri_submit_buffers+0xe3/0x230 [xfs]
[345167.063783]  xfsaild_push+0x1aa/0x740 [xfs]
[345167.063893]  xfsaild+0xb2/0x150 [xfs]
[345167.063996]  ? __pfx_xfsaild+0x10/0x10 [xfs]
[345167.064096]  kthread+0xdd/0x100
[345167.064099]  ? __pfx_kthread+0x10/0x10
[345167.064102]  ret_from_fork+0x29/0x50
[345167.064105]  </TASK>
[345167.064149] INFO: task UV_WORKER[13]:882664 blocked for more than 122 seconds.
[345167.064152]       Tainted: G               X  -------  ---  5.14.0-503.14.1.el9_5.x86_64 #1
[345167.064154] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[345167.064156] task:UV_WORKER[13]   state:D stack:0     pid:882664 tgid:882471 ppid:1      flags:0x00000002
[345167.064160] Call Trace:
[345167.064161]  <TASK>
[345167.064163]  __schedule+0x229/0x550
[345167.064166]  ? bio_associate_blkg_from_css+0xf5/0x320
[345167.064170]  schedule+0x2e/0xd0
[345167.064172]  md_write_start.part.0+0x195/0x250
[345167.064175]  ? __pfx_autoremove_wake_function+0x10/0x10
[345167.064178]  raid1_make_request+0x5b/0xbb [raid1]
[345167.064184]  md_handle_request+0x150/0x270
[345167.064187]  ? __bio_split_to_limits+0x8e/0x280
[345167.064190]  __submit_bio+0x94/0x130
[345167.064192]  __submit_bio_noacct+0x7e/0x1e0
[345167.064194]  iomap_submit_ioend+0x4e/0x80
[345167.064199]  xfs_vm_writepages+0x7a/0xb0 [xfs]
[345167.064305]  do_writepages+0xcc/0x1a0
[345167.064308]  filemap_fdatawrite_wbc+0x66/0x90
[345167.064312]  __filemap_fdatawrite_range+0x54/0x80
[345167.064317]  file_write_and_wait_range+0x48/0xb0
[345167.064319]  xfs_file_fsync+0x5a/0x240 [xfs]
[345167.064425]  __x64_sys_fsync+0x33/0x60
[345167.064430]  do_syscall_64+0x5c/0xf0
[345167.064433]  ? fcntl_setlk+0x1cb/0x3b0
[345167.064437]  ? do_fcntl+0x458/0x670
[345167.064440]  ? syscall_exit_work+0x103/0x130
[345167.064443]  ? syscall_exit_to_user_mode+0x19/0x40
[345167.064446]  ? do_syscall_64+0x6b/0xf0
[345167.064448]  ? __count_memcg_events+0x4f/0xb0
[345167.064451]  ? mm_account_fault+0x6c/0x100
[345167.064455]  ? handle_mm_fault+0x116/0x270
[345167.064458]  ? do_user_addr_fault+0x1b4/0x6a0
[345167.064461]  ? exc_page_fault+0x62/0x150
[345167.064465]  entry_SYSCALL_64_after_hwframe+0x78/0x80
[345167.064468] RIP: 0033:0x7f36adb0459b
[345167.064496] RSP: 002b:00007f36a0ce4c20 EFLAGS: 00000293 ORIG_RAX: 000000000000004a
[345167.064500] RAX: ffffffffffffffda RBX: 0000563b7f63af38 RCX: 00007f36adb0459b
[345167.064502] RDX: 0000000000000002 RSI: 0000000000000002 RDI: 000000000000000d
[345167.064504] RBP: 0000000000000008 R08: 0000000000000000 R09: 0000000000000000
[345167.064506] R10: 0000000000000000 R11: 0000000000000293 R12: 0000563b7f63aea8
[345167.064508] R13: 0000563b82320850 R14: 0000000000000000 R15: 00007f36a0ce4ce0
[345167.064512]  </TASK>
[345167.064562] INFO: task kworker/u16:2:1205595 blocked for more than 122 seconds.
[345167.064565]       Tainted: G               X  -------  ---  5.14.0-503.14.1.el9_5.x86_64 #1
[345167.064567] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[345167.064569] task:kworker/u16:2   state:D stack:0     pid:1205595 tgid:1205595 ppid:2      flags:0x00004000
[345167.064574] Workqueue: writeback wb_workfn (flush-253:6)
[345167.064578] Call Trace:
[345167.064579]  <TASK>
[345167.064581]  __schedule+0x229/0x550
[345167.064584]  ? bio_associate_blkg_from_css+0xf5/0x320
[345167.064587]  schedule+0x2e/0xd0
[345167.064590]  md_write_start.part.0+0x195/0x250
[345167.064593]  ? __pfx_autoremove_wake_function+0x10/0x10
[345167.064596]  raid1_make_request+0x5b/0xbb [raid1]
[345167.064602]  md_handle_request+0x150/0x270
[345167.064605]  ? __bio_split_to_limits+0x8e/0x280
[345167.064608]  __submit_bio+0x94/0x130
[345167.064610]  __submit_bio_noacct+0x7e/0x1e0
[345167.064612]  iomap_submit_ioend+0x4e/0x80
[345167.064616]  iomap_writepage_map+0x30a/0x4c0
[345167.064618]  write_cache_pages+0x13c/0x3a0
[345167.064620]  ? __pfx_iomap_do_writepage+0x10/0x10
[345167.064623]  ? scsi_dispatch_cmd+0x8d/0x240
[345167.064626]  ? scsi_queue_rq+0x1ad/0x610
[345167.064631]  ? update_sg_lb_stats+0xb6/0x460
[345167.064635]  iomap_writepages+0x1c/0x40
[345167.064638]  xfs_vm_writepages+0x7a/0xb0 [xfs]
[345167.064739]  do_writepages+0xcc/0x1a0
[345167.064742]  ? __percpu_counter_sum_mask+0x6f/0x80
[345167.064747]  __writeback_single_inode+0x41/0x270
[345167.064750]  writeback_sb_inodes+0x209/0x4a0
[345167.064753]  __writeback_inodes_wb+0x4c/0xe0
[345167.064755]  wb_writeback+0x1d7/0x2d0
[345167.064758]  wb_do_writeback+0x1d1/0x2b0
[345167.064760]  wb_workfn+0x5e/0x290
[345167.064763]  ? try_to_wake_up+0x1ca/0x530
[345167.064766]  process_one_work+0x194/0x380
[345167.064769]  worker_thread+0x2fe/0x410
[345167.064772]  ? __pfx_worker_thread+0x10/0x10
[345167.064775]  kthread+0xdd/0x100
[345167.064778]  ? __pfx_kthread+0x10/0x10
[345167.064781]  ret_from_fork+0x29/0x50
[345167.064784]  </TASK>
[345167.064786] INFO: task kworker/u16:0:1209123 blocked for more than 122 seconds.
[345167.064788]       Tainted: G               X  -------  ---  5.14.0-503.14.1.el9_5.x86_64 #1
[345167.064790] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[345167.064792] task:kworker/u16:0   state:D stack:0     pid:1209123 tgid:1209123 ppid:2      flags:0x00004000
[345167.064796] Workqueue: writeback wb_workfn (flush-253:6)
[345167.064799] Call Trace:
[345167.064801]  <TASK>
[345167.064802]  __schedule+0x229/0x550
[345167.064805]  ? bio_associate_blkg_from_css+0xf5/0x320
[345167.064808]  schedule+0x2e/0xd0
[345167.064811]  md_write_start.part.0+0x195/0x250
[345167.064813]  ? __pfx_autoremove_wake_function+0x10/0x10
[345167.064817]  raid1_make_request+0x5b/0xbb [raid1]
[345167.064822]  md_handle_request+0x150/0x270
[345167.064825]  ? __bio_split_to_limits+0x8e/0x280
[345167.064828]  __submit_bio+0x94/0x130
[345167.064830]  __submit_bio_noacct+0x7e/0x1e0
[345167.064832]  iomap_submit_ioend+0x4e/0x80
[345167.064835]  iomap_writepage_map+0x30a/0x4c0
[345167.064838]  write_cache_pages+0x13c/0x3a0
[345167.064840]  ? __pfx_iomap_do_writepage+0x10/0x10
[345167.064843]  ? scsi_dispatch_cmd+0x8d/0x240
[345167.064845]  ? scsi_queue_rq+0x1ad/0x610
[345167.064848]  ? update_sg_lb_stats+0xb6/0x460
[345167.064851]  iomap_writepages+0x1c/0x40
[345167.064854]  xfs_vm_writepages+0x7a/0xb0 [xfs]
[345167.064949]  do_writepages+0xcc/0x1a0
[345167.064952]  ? __percpu_counter_sum_mask+0x6f/0x80
[345167.064955]  __writeback_single_inode+0x41/0x270
[345167.064958]  writeback_sb_inodes+0x209/0x4a0
[345167.064961]  __writeback_inodes_wb+0x4c/0xe0
[345167.064963]  wb_writeback+0x1d7/0x2d0
[345167.064965]  wb_do_writeback+0x1d1/0x2b0
[345167.064968]  wb_workfn+0x5e/0x290
[345167.064970]  ? __switch_to_asm+0x3a/0x80
[345167.064972]  ? finish_task_switch.isra.0+0x8c/0x2a0
[345167.064976]  ? __schedule+0x231/0x550
[345167.064979]  process_one_work+0x194/0x380
[345167.064982]  worker_thread+0x2fe/0x410
[345167.064985]  ? __pfx_worker_thread+0x10/0x10
[345167.064987]  kthread+0xdd/0x100
[345167.064990]  ? __pfx_kthread+0x10/0x10
[345167.064994]  ret_from_fork+0x29/0x50
[345167.064996]  </TASK>
[345167.064999] INFO: task kworker/u16:4:1216782 blocked for more than 122 seconds.
[345167.065001]       Tainted: G               X  -------  ---  5.14.0-503.14.1.el9_5.x86_64 #1
[345167.065004] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[345167.065005] task:kworker/u16:4   state:D stack:0     pid:1216782 tgid:1216782 ppid:2      flags:0x00004000
[345167.065009] Workqueue: writeback wb_workfn (flush-253:6)
[345167.065012] Call Trace:
[345167.065014]  <TASK>
[345167.065015]  __schedule+0x229/0x550
[345167.065018]  ? bio_associate_blkg_from_css+0xf5/0x320
[345167.065021]  schedule+0x2e/0xd0
[345167.065024]  md_write_start.part.0+0x195/0x250
[345167.065026]  ? __pfx_autoremove_wake_function+0x10/0x10
[345167.065030]  raid1_make_request+0x5b/0xbb [raid1]
[345167.065035]  md_handle_request+0x150/0x270
[345167.065038]  ? __bio_split_to_limits+0x8e/0x280
[345167.065041]  __submit_bio+0x94/0x130
[345167.065043]  __submit_bio_noacct+0x7e/0x1e0
[345167.065045]  iomap_submit_ioend+0x4e/0x80
[345167.065048]  xfs_vm_writepages+0x7a/0xb0 [xfs]
[345167.065140]  do_writepages+0xcc/0x1a0
[345167.065143]  ? __wb_calc_thresh+0x3a/0x120
[345167.065145]  __writeback_single_inode+0x41/0x270
[345167.065147]  writeback_sb_inodes+0x209/0x4a0
[345167.065150]  __writeback_inodes_wb+0x4c/0xe0
[345167.065153]  wb_writeback+0x1d7/0x2d0
[345167.065155]  wb_do_writeback+0x22a/0x2b0
[345167.065157]  wb_workfn+0x5e/0x290
[345167.065160]  ? try_to_wake_up+0x1ca/0x530
[345167.065163]  process_one_work+0x194/0x380
[345167.065166]  worker_thread+0x2fe/0x410
[345167.065168]  ? __pfx_worker_thread+0x10/0x10
[345167.065171]  kthread+0xdd/0x100
[345167.065174]  ? __pfx_kthread+0x10/0x10
[345167.065177]  ret_from_fork+0x29/0x50
[345167.065180]  </TASK>
[345167.065181] INFO: task kworker/1:0:1217700 blocked for more than 122 seconds.
[345167.065184]       Tainted: G               X  -------  ---  5.14.0-503.14.1.el9_5.x86_64 #1
[345167.065186] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[345167.065188] task:kworker/1:0     state:D stack:0     pid:1217700 tgid:1217700 ppid:2      flags:0x00004000
[345167.065192] Workqueue: xfs-sync/dm-4 xfs_log_worker [xfs]
[345167.065302] Call Trace:
[345167.065304]  <TASK>
[345167.065305]  __schedule+0x229/0x550
[345167.065309]  ? __send_empty_flush+0xea/0x120 [dm_mod]
[345167.065324]  schedule+0x2e/0xd0
[345167.065327]  md_flush_request+0x9b/0x1e0
[345167.065331]  ? __pfx_autoremove_wake_function+0x10/0x10
[345167.065335]  raid1_make_request+0xa8/0xbb [raid1]
[345167.065340]  md_handle_request+0x150/0x270
[345167.065343]  ? __bio_split_to_limits+0x8e/0x280
[345167.065346]  __submit_bio+0x94/0x130
[345167.065348]  __submit_bio_noacct+0x7e/0x1e0
[345167.065350]  xlog_state_release_iclog+0xe6/0x1c0 [xfs]
[345167.065464]  xfs_log_force+0x172/0x230 [xfs]
[345167.065566]  xfs_log_worker+0x3b/0xd0 [xfs]
[345167.065664]  process_one_work+0x194/0x380
[345167.065667]  worker_thread+0x2fe/0x410
[345167.065669]  ? __pfx_worker_thread+0x10/0x10
[345167.065672]  kthread+0xdd/0x100
[345167.065675]  ? __pfx_kthread+0x10/0x10
[345167.065678]  ret_from_fork+0x29/0x50
[345167.065681]  </TASK>
[345167.065683] INFO: task kworker/0:2:1219498 blocked for more than 122 seconds.
[345167.065685]       Tainted: G               X  -------  ---  5.14.0-503.14.1.el9_5.x86_64 #1
[345167.065687] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[345167.065689] task:kworker/0:2     state:D stack:0     pid:1219498 tgid:1219498 ppid:2      flags:0x00004000
[345167.065693] Workqueue: xfs-sync/dm-6 xfs_log_worker [xfs]
[345167.065790] Call Trace:
[345167.065791]  <TASK>
[345167.065793]  __schedule+0x229/0x550
[345167.065796]  ? __send_empty_flush+0xea/0x120 [dm_mod]
[345167.065810]  schedule+0x2e/0xd0
[345167.065812]  md_flush_request+0x9b/0x1e0
[345167.065816]  ? __pfx_autoremove_wake_function+0x10/0x10
[345167.065819]  raid1_make_request+0xa8/0xbb [raid1]
[345167.065825]  md_handle_request+0x150/0x270
[345167.065827]  ? __bio_split_to_limits+0x8e/0x280
[345167.065830]  __submit_bio+0x94/0x130
[345167.065832]  __submit_bio_noacct+0x7e/0x1e0
[345167.065835]  xlog_state_release_iclog+0xe6/0x1c0 [xfs]
[345167.065931]  xfs_log_force+0x172/0x230 [xfs]
[345167.066027]  xfs_log_worker+0x3b/0xd0 [xfs]
[345167.066122]  process_one_work+0x194/0x380
[345167.066125]  worker_thread+0x2fe/0x410
[345167.066128]  ? __pfx_worker_thread+0x10/0x10
[345167.066131]  kthread+0xdd/0x100
[345167.066134]  ? __pfx_kthread+0x10/0x10
[345167.066137]  ret_from_fork+0x29/0x50
[345167.066140]  </TASK>
[345167.066141] INFO: task kworker/u16:1:1220633 blocked for more than 122 seconds.
[345167.066144]       Tainted: G               X  -------  ---  5.14.0-503.14.1.el9_5.x86_64 #1
[345167.066146] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[345167.066148] task:kworker/u16:1   state:D stack:0     pid:1220633 tgid:1220633 ppid:2      flags:0x00004000
[345167.066152] Workqueue: writeback wb_workfn (flush-253:6)
[345167.066155] Call Trace:
[345167.066157]  <TASK>
[345167.066158]  __schedule+0x229/0x550
[345167.066162]  schedule+0x2e/0xd0
[345167.066165]  md_write_start.part.0+0x195/0x250
[345167.066167]  ? __pfx_autoremove_wake_function+0x10/0x10
[345167.066171]  raid1_make_request+0x5b/0xbb [raid1]
[345167.066177]  md_handle_request+0x150/0x270
[345167.066179]  ? __bio_split_to_limits+0x8e/0x280
[345167.066182]  __submit_bio+0x94/0x130
[345167.066185]  __submit_bio_noacct+0x7e/0x1e0
[345167.066187]  iomap_submit_ioend+0x4e/0x80
[345167.066191]  xfs_vm_writepages+0x7a/0xb0 [xfs]
[345167.066299]  do_writepages+0xcc/0x1a0
[345167.066301]  ? find_busiest_group+0x43/0x240
[345167.066304]  __writeback_single_inode+0x41/0x270
[345167.066306]  writeback_sb_inodes+0x209/0x4a0
[345167.066309]  __writeback_inodes_wb+0x4c/0xe0
[345167.066312]  wb_writeback+0x1d7/0x2d0
[345167.066314]  wb_do_writeback+0x1d1/0x2b0
[345167.066317]  wb_workfn+0x5e/0x290
[345167.066319]  ? try_to_wake_up+0x1ca/0x530
[345167.066322]  process_one_work+0x194/0x380
[345167.066325]  worker_thread+0x2fe/0x410
[345167.066328]  ? __pfx_worker_thread+0x10/0x10
[345167.066330]  kthread+0xdd/0x100
[345167.066333]  ? __pfx_kthread+0x10/0x10
[345167.066336]  ret_from_fork+0x29/0x50
[345167.066339]  </TASK>
[345274.582484] nvme nvme0: Device not ready; aborting reset, CSTS=0x1
[345274.588547] nvme nvme0: Abort status: 0x371
[345274.588554] nvme nvme0: Abort status: 0x371
[345274.588556] nvme nvme0: Abort status: 0x371
[345402.595930] nvme nvme0: Device not ready; aborting reset, CSTS=0x1
[345402.596168] nvme nvme0: Disabling device after reset failure: -19
[345402.603001] I/O error, dev nvme0n1, sector 31757592 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 2
[345402.603001] I/O error, dev nvme0n1, sector 31745656 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 2
[345402.603005] I/O error, dev nvme0n1, sector 4196368 op 0x1:(WRITE) flags 0x29800 phys_seg 1 prio class 2
[345402.603011] md: super_written gets error=-5
[345402.603011] md/raid1:md127: nvme0n1p3: rescheduling sector 27297048
[345402.603017] I/O error, dev nvme0n1, sector 0 op 0x1:(WRITE) flags 0x800 phys_seg 0 prio class 2
[345402.603018] md/raid1:md127: nvme0n1p3: rescheduling sector 27285112
[345402.603021] md/raid1:md127: Disk failure on nvme0n1p3, disabling device.
                md/raid1:md127: Operation continuing on 1 devices.
[345402.603021] I/O error, dev nvme0n1, sector 31835944 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 2
[345402.603024] md/raid1:md127: nvme0n1p3: rescheduling sector 27375400
[345402.603025] I/O error, dev nvme0n1, sector 31772336 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 2
[345402.603027] md/raid1:md127: nvme0n1p3: rescheduling sector 27311792
[345402.603037] I/O error, dev nvme0n1, sector 31790576 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 2
[345402.603040] md/raid1:md127: nvme0n1p3: rescheduling sector 27330032
[345402.603066] I/O error, dev nvme0n1, sector 31750480 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 2
[345402.603071] md/raid1:md127: nvme0n1p3: rescheduling sector 27289936
[345402.603073] I/O error, dev nvme0n1, sector 31831344 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 2
[345402.603076] md/raid1:md127: nvme0n1p3: rescheduling sector 27370800
[345402.603100] nvme nvme0: Identify namespace failed (-5)
[345402.606121] md/raid1:md127: redirecting sector 27297048 to other mirror: sdc3
[345402.616231] md/raid1:md127: redirecting sector 27285112 to other mirror: sdc3
[345402.618772] md/raid1:md127: redirecting sector 27375400 to other mirror: sdc3
[345402.620045] md/raid1:md127: redirecting sector 27311792 to other mirror: sdc3
[345402.621385] md/raid1:md127: redirecting sector 27330032 to other mirror: sdc3
[345402.623214] md/raid1:md127: redirecting sector 27289936 to other mirror: sdc3
[345402.625367] md/raid1:md127: redirecting sector 27370800 to other mirror: sdc3
[345415.911236] nvme nvme0: Identify namespace failed (-5)
[346065.904105] nvme nvme0: Identify namespace failed (-5)
        ... (the same "Identify namespace failed (-5)" line repeats every 10-20 minutes from here on) ...
[380595.546287] nvme nvme0: Identify namespace failed (-5)
[380894.663810] systemd-rc-local-generator[1347729]: /etc/rc.d/rc.local is not marked executable, skipping.
[380902.636127] nvme nvme0: Identify namespace failed (-5)
[469038.217996] systemd-rc-local-generator[1658780]: /etc/rc.d/rc.local is not marked executable, skipping.
[469041.391405] nvme nvme0: Identify namespace failed (-5)

Hmm...

So the sequence was: I/O timeouts, a controller reset that failed, the kernel disabling the device, and md failing the member over to sdc3. The question now is what to buy as a replacement...

I'll pick a candidate while comparing TBW ratings (sequential read/write in MB/s where noted):

Crucial P1 (1900MB/950MB)
Crucial P3 Plus 512GB 500TBW (5000MB/4200MB)
Crucial T500 500GB 300TBW
Crucial P310 500GB 110TBW
Crucial P3 500GB 110TBW
Lexar LNM620X512G-RNNNG 512GB 250TBW
fanxiang S500 Pro 500GB 320TBW (3500MB/2700MB)
fanxiang S501Q 512GB 160TBW (3600MB/2700MB) ← the one that just died
fanxiang S660 500GB 350TBW (4600MB/2650MB)
fanxiang S880E 500GB 300TBW (6300MB/3100MB)
Fikwot FN960 512GB 350TBW (7400MB/2750MB)
Fikwot FX991 500GB 300TBW (6300MB/3100MB)
Samsung 980 500GB 300TBW
Ediloca EN600 PRO 500GB 320TBW (3200MB/2800MB)
Ediloca EN605 500GB 300TBW (2150MB/1600MB)
Ediloca EN760 500GB 350TBW (4800MB/2650MB)
Ediloca EN855 500GB 350TBW (7400MB/2750MB)
WD Blue SN580 500GB 300TBW
ADATA LEGEND 800 series 500GB 300TBW
Acclamator N20 500GB 250TBW (2500MB/2000MB)
Acclamator N30 500GB 300TBW (3500MB/3000MB)
ORICO J10 512GB 150TBW (2800MB/1300MB)
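For scale: the S501Q's 160TBW works out to roughly 160,000GB ÷ (5 × 365 days) ≈ 88GB of writes per day, if you assume a typical five-year warranty window. Two weeks of home-server writes can't have come anywhere near that, so this was presumably a controller failure rather than wear.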

I remembered there's an nvme command for inspecting NVMe drives, so I tried it while following the Arch Linux wiki page 「ソリッドステートドライブ/NVMe」 (Solid State Drives/NVMe).

Right now the device doesn't show up in nvme list:

[root@niselog ~]# nvme list
Node                  Generic               SN                   Model                                    Namespace  Usage                      Format           FW Rev
--------------------- --------------------- -------------------- ---------------------------------------- ---------- -------------------------- ---------------- --------
[root@niselog ~]#

I tried nvme error-log to see whether the error log is readable, but with the device invisible it fails:

[root@niselog ~]# nvme error-log  /dev/nvme0n1
identify controller: Input/output error
[root@niselog ~]#

A reset is no good either (nvme reset normally wants the controller node, /dev/nvme0, rather than the namespace block device, though with the controller wedged it likely fails either way):

[root@niselog ~]# nvme reset /dev/nvme0n1
Reset: Block device required
[root@niselog ~]#

So, re-scan it, then? Running nvme discover, and the device was recognized again:

[root@niselog ~]# nvme discover
[root@niselog ~]# nvme list
Node                  Generic               SN                   Model                                    Namespace  Usage                      Format           FW Rev
--------------------- --------------------- -------------------- ---------------------------------------- ---------- -------------------------- ---------------- --------
/dev/nvme0n1          /dev/ng0n1            FXS501Q244110889     Fanxiang S501Q 512GB                     0x1        512.11  GB /   0.00   B    512   B +  0 B   SN22751
[root@niselog ~]#
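While it's visible, this would be the moment to grab the drive's health data; a sketch (nvme smart-log takes the controller node):

nvme smart-log /dev/nvme0    # percentage_used, media_errors, data_units_written, unsafe_shutdowns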

Huh????

[root@niselog ~]# nvme error-log  /dev/nvme0n1
identify controller: Input/output error
[root@niselog ~]# nvme list
Node                  Generic               SN                   Model                                    Namespace  Usage                      Format           FW Rev
--------------------- --------------------- -------------------- ---------------------------------------- ---------- -------------------------- ---------------- --------
[root@niselog ~]#

It had immediately gone offline again.
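(For what it's worth, nvme discover is really an NVMe-over-Fabrics command, so the brief reappearance was more likely the kernel re-probing the controller on its own. A more deliberate way to force a re-probe would be a PCI remove/rescan; a sketch, using the controller's PCI address as it appears in sysfs further down:)

# detach the flaky controller, then have the kernel walk the bus and probe it again
echo 1 > /sys/devices/pci0000:00/0000:00:1c.0/0000:01:00.0/remove
echo 1 > /sys/bus/pci/rescan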

dmesg shows nothing new, just the same Identify failures:

[518981.064372] nvme nvme0: Identify namespace failed (-5)
[519070.106359] nvme nvme0: Identify namespace failed (-5)
[519106.607320] nvme nvme0: Identify namespace failed (-5)
[519392.028895] nvme nvme0: Identify namespace failed (-5)
[519430.063154] nvme nvme0: Identify namespace failed (-5)
[519439.241555] nvme nvme0: Identify namespace failed (-5)

Looks hopeless?

Now, further down the Arch Linux wiki page, the section 「APST サポートの問題によるコントローラの機能不全」 (controller malfunction caused by broken APST support) shows logs much like mine:

[345055.452619] nvme nvme0: I/O tag 322 (0142) opcode 0x0 (Flush) QID 4 timeout, aborting req_op:FLUSH(2) size:0
[345057.437597] nvme nvme0: I/O tag 210 (a0d2) opcode 0x2 (Read) QID 2 timeout, aborting req_op:READ(0) size:32768
[345057.437643] nvme nvme0: I/O tag 706 (c2c2) opcode 0x2 (Read) QID 3 timeout, aborting req_op:READ(0) size:32768
[345085.664306] nvme nvme0: I/O tag 322 (0142) opcode 0x0 (Flush) QID 4 timeout, reset controller
[345274.582484] nvme nvme0: Device not ready; aborting reset, CSTS=0x1
[345274.588547] nvme nvme0: Abort status: 0x371
[345274.588554] nvme nvme0: Abort status: 0x371
[345274.588556] nvme nvme0: Abort status: 0x371
[345402.595930] nvme nvme0: Device not ready; aborting reset, CSTS=0x1
[345402.596168] nvme nvme0: Disabling device after reset failure: -19
[345402.603001] I/O error, dev nvme0n1, sector 31757592 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 2
[345402.603001] I/O error, dev nvme0n1, sector 31745656 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 2
[345402.603005] I/O error, dev nvme0n1, sector 4196368 op 0x1:(WRITE) flags 0x29800 phys_seg 1 prio class 2
[345402.603011] md/raid1:md127: nvme0n1p3: rescheduling sector 27297048
[345402.603017] I/O error, dev nvme0n1, sector 0 op 0x1:(WRITE) flags 0x800 phys_seg 0 prio class 2
[345402.603018] md/raid1:md127: nvme0n1p3: rescheduling sector 27285112
[345402.603021] md/raid1:md127: Disk failure on nvme0n1p3, disabling device.

That said, the running kernel is 5.14.0-503.14.1.el9_5.x86_64, so this should be a problem that has long since been worked around.

Even so, let's check what the current values are:

[root@niselog sys]# find /sys -print|grep nvme|grep latency
/sys/devices/pci0000:00/0000:00:1c.0/0000:01:00.0/nvme/nvme0/power/pm_qos_latency_tolerance_us
/sys/module/nvme_core/parameters/apst_primary_latency_tol_us
/sys/module/nvme_core/parameters/apst_secondary_latency_tol_us
/sys/module/nvme_core/parameters/default_ps_max_latency_us
[root@niselog sys]# cat /sys/module/nvme_core/parameters/apst_primary_latency_tol_us
15000
[root@niselog sys]# cat /sys/module/nvme_core/parameters/apst_secondary_latency_tol_us
100000
[root@niselog sys]# cat /sys/module/nvme_core/parameters/default_ps_max_latency_us
100000
[root@niselog sys]# cat /sys/devices/pci0000:00/0000:00:1c.0/0000:01:00.0/nvme/nvme0/power/pm_qos_latency_tolerance_us
100000
[root@niselog sys]#

For now, try setting the value to 0:

[root@niselog sys]# echo 0 > /sys/module/nvme_core/parameters/default_ps_max_latency_us
[root@niselog sys]# cat /sys/module/nvme_core/parameters/default_ps_max_latency_us
0
[root@niselog sys]#
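Note that the runtime write above only matters for controllers probed after the change, since the driver negotiates APST when it initializes the device. The form the Arch wiki actually recommends is a kernel boot parameter, which on an EL9 system would look like this sketch:

# persist nvme_core.default_ps_max_latency_us=0 across reboots, then reboot
grubby --update-kernel=ALL --args="nvme_core.default_ps_max_latency_us=0"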

And as expected, it disappears again right away:

[root@niselog sys]# nvme list
Node                  Generic               SN                   Model                                    Namespace  Usage                      Format           FW Rev
--------------------- --------------------- -------------------- ---------------------------------------- ---------- -------------------------- ---------------- --------
[root@niselog sys]# nvme discover
[root@niselog sys]# nvme list
Node                  Generic               SN                   Model                                    Namespace  Usage                      Format           FW Rev
--------------------- --------------------- -------------------- ---------------------------------------- ---------- -------------------------- ---------------- --------
/dev/nvme0n1          /dev/ng0n1            FXS501Q244110889     Fanxiang S501Q 512GB                     0x1        512.11  GB /   0.00   B    512   B +  0 B   SN22751
[root@niselog sys]# nvme list
Node                  Generic               SN                   Model                                    Namespace  Usage                      Format           FW Rev
--------------------- --------------------- -------------------- ---------------------------------------- ---------- -------------------------- ---------------- --------
[root@niselog sys]#
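Whichever replacement I go with, the swap itself should be mechanical; a sketch, assuming the new SSD gets the same partition layout and comes up as nvme0n1 again:

# drop the dead member, partition the new disk to match, then let md resync
mdadm /dev/md127 --remove failed
mdadm /dev/md127 --add /dev/nvme0n1p3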
