[WIP PATCH] SR RAID1 checksumming support

classic Classic list List threaded Threaded
1 message Options
Reply | Threaded
Open this post in threaded view
|

[WIP PATCH] SR RAID1 checksumming support

Karel Gardas
Hello,

Attached is my work in progress on checksumming support for softraid
RAID1. Currently it only does the following:
- computation of checksums (crc32)
- verification of checksums
- signalling of bad checksums to the console and to sensors

E.g.:
$ sysctl hw.sensors.softraid0
hw.sensors.softraid0.raw0=0 (sd0f), OK
hw.sensors.softraid0.raw1=0 (sd0g), OK
hw.sensors.softraid0.drive0=online (sd1), OK


Next TODO items:
- hand over to another chunk (restart the wu) in case of a checksum error
- properly handle errors happening on all chunks
- "self-healing" of bad sectors

Note: checksums are computed on a per-sector basis and saved in an area
allocated at the end of the drive. Due to this design,
LBA collision detection in softraid.c was enhanced/fixed to also
support this use case,
and currently it may not be compatible with RAID5/6 usage.

Any comments welcome!

Thanks!
Karel

Index: sbin/bioctl/bioctl.8
===================================================================
RCS file: /cvs/src/sbin/bioctl/bioctl.8,v
retrieving revision 1.96
diff -u -p -u -r1.96 bioctl.8
--- sbin/bioctl/bioctl.8    29 May 2015 00:33:37 -0000    1.96
+++ sbin/bioctl/bioctl.8    31 Aug 2015 20:02:47 -0000
@@ -199,6 +199,11 @@ for example, force the creation of volum
 with unclean data in the metadata areas.
 .It Ar noauto
 Do not automatically assemble this volume at boot time.
+.It Ar chksum
+Enforce usage of checksums on the device blocks. The checksum area is
+located at the end of the device data area and since it occupies some
+space it makes the actual usable device size smaller. We need exactly 8
+bytes of checksum per device data block.
 .El
 .It Fl c Ar raidlevel
 Create a
Index: sbin/bioctl/bioctl.c
===================================================================
RCS file: /cvs/src/sbin/bioctl/bioctl.c,v
retrieving revision 1.129
diff -u -p -u -r1.129 bioctl.c
--- sbin/bioctl/bioctl.c    18 Jul 2015 23:23:20 -0000    1.129
+++ sbin/bioctl/bioctl.c    31 Aug 2015 20:02:47 -0000
@@ -1053,6 +1053,9 @@ bio_createflags(char *lst)
             case 'n':
                 flags |= BIOC_SCNOAUTOASSEMBLE;
                 break;
+            case 'c':
+                flags |= BIOC_SCCHKSUM;
+                break;
             default:
                 strlcpy(fs, s, sz + 1);
                 errx(1, "invalid flag %s", fs);
Index: sys/dev/biovar.h
===================================================================
RCS file: /cvs/src/sys/dev/biovar.h,v
retrieving revision 1.44
diff -u -p -u -r1.44 biovar.h
--- sys/dev/biovar.h    29 May 2015 00:33:37 -0000    1.44
+++ sys/dev/biovar.h    31 Aug 2015 20:02:49 -0000
@@ -213,6 +213,7 @@ struct bioc_createraid {
 #define BIOC_SCDEVT        0x02    /* dev_t array or string in dev_list */
 #define BIOC_SCNOAUTOASSEMBLE    0x04    /* do not assemble during autoconf */
 #define BIOC_SCBOOTABLE        0x08    /* device is bootable */
+#define BIOC_SCCHKSUM        0x10    /* device provides chksum capability */
     u_int32_t    bc_opaque_size;
     u_int32_t    bc_opaque_flags;
 #define    BIOC_SOINVALID        0x00    /* no opaque pointer */
Index: sys/dev/softraid.c
===================================================================
RCS file: /cvs/src/sys/dev/softraid.c,v
retrieving revision 1.364
diff -u -p -u -r1.364 softraid.c
--- sys/dev/softraid.c    19 Aug 2015 19:05:24 -0000    1.364
+++ sys/dev/softraid.c    31 Aug 2015 20:02:50 -0000
@@ -71,6 +71,7 @@ uint32_t    sr_debug = 0
             /* | SR_D_DIS */
             /* | SR_D_STATE */
             /* | SR_D_REBUILD */
+            /* | SR_D_CHKSUM  */
         ;
 #endif

@@ -144,6 +145,8 @@ int            sr_chunk_in_use(struct sr_softc *,
 int            sr_rw(struct sr_softc *, dev_t, char *, size_t,
                 daddr_t, long);
 void            sr_wu_done_callback(void *);
+int            sr_wu_collision(struct sr_workunit *,
+                struct sr_workunit *);

 /* don't include these on RAMDISK */
 #ifndef SMALL_KERNEL
@@ -2264,6 +2267,9 @@ sr_wu_done_callback(void *xwu)

     s = splbio();

+    DNPRINTF(SR_D_WU, "%s: sr_wu_done: %p\n",
+         DEVNAME(sd->sd_sc), wu);
+
     if (xs != NULL) {
         if (wu->swu_ios_failed)
             xs->error = XS_DRIVER_STUFFUP;
@@ -2286,11 +2292,54 @@ sr_wu_done_callback(void *xwu)
     TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link);

     if (wu->swu_collider) {
-        if (wu->swu_ios_failed)
-            sr_raid_recreate_wu(wu->swu_collider);
+        DNPRINTF(SR_D_WU, "%s: sr_wu_done, searching for collider: %p\n",
+             DEVNAME(sd->sd_sc), wu->swu_collider);
+        if (wu->swu_ios_failed) {
+          DNPRINTF(SR_D_WU, "%s: sr_wu_done, recreate collider?: %p WHY???\n",
+               DEVNAME(sd->sd_sc), wu->swu_collider);
+          sr_raid_recreate_wu(wu->swu_collider);
+        }
+        /*
+         * Search for another wu that has the same collider as the
+         * current wu. If we find one we can continue without
+         * starting the collider. If we do not find one then we
+         * need to start the collider, as the current wu is the
+         * last wu the collider collides with.
+         */
+        int found = 0;
+        DNPRINTF(SR_D_WU, "%s: sr_wu_done, searching for collider:"
+            " %p\n", DEVNAME(sd->sd_sc), wu->swu_collider);
+        TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
+            if (wu->swu_collider == wup->swu_collider) {
+                DNPRINTF(SR_D_WU, "%s: sr_wu_done, found"
+                    " collider in wu: %p\n",
+                    DEVNAME(sd->sd_sc), wup);
+                found++;
+                break;
+            }
+        }
+        TAILQ_FOREACH(wup, &sd->sd_wu_defq, swu_link) {
+            if (wu->swu_collider == wup->swu_collider) {
+                DNPRINTF(SR_D_WU, "%s: sr_wu_done, found"
+                    " collider in def wu: %p\n",
+                    DEVNAME(sd->sd_sc), wup);
+                found++;
+                break;
+            }
+        }
+        DNPRINTF(SR_D_WU, "%s: sr_wu_done, collider found: %d\n",
+            DEVNAME(sd->sd_sc), found);

-        /* XXX Should the collider be failed if this xs failed? */
-        sr_raid_startwu(wu->swu_collider);
+        if (found == 0) {
+            /* The current wu is the last wu colliding
+               with the collider. */
+            DNPRINTF(SR_D_WU, "%s: sr_wu_done, starting the collider: %p\n",
+                DEVNAME(sd->sd_sc), wu->swu_collider);
+            sr_raid_startwu(wu->swu_collider);
+        }
+        else {
+            wu->swu_collider = NULL;
+        }
     }

     /*
@@ -3967,6 +4016,7 @@ sr_discipline_init(struct sr_discipline
     sd->sd_set_chunk_state = sr_set_chunk_state;
     sd->sd_set_vol_state = sr_set_vol_state;
     sd->sd_start_discipline = NULL;
+    sd->sd_wu_collision_detection = NULL;

     task_set(&sd->sd_meta_save_task, sr_meta_save_callback, sd);
     task_set(&sd->sd_hotspare_rebuild_task, sr_hotspare_rebuild_callback,
@@ -4181,11 +4231,30 @@ sr_raid_intr(struct buf *bp)
     splx(s);
 }

+int
+sr_wu_collision(struct sr_workunit *wu1, struct sr_workunit *wu2)
+{
+    struct sr_discipline    *sd = wu1->swu_dis;
+
+    if (sd->sd_wu_collision_detection) {
+        return sd->sd_wu_collision_detection(wu1, wu2);
+    }
+    else if (wu1->swu_blk_end < wu2->swu_blk_start ||
+        wu2->swu_blk_end < wu1->swu_blk_start) {
+        return 0;
+    }
+    else {
+        return 1;
+    }
+}
+
 void
 sr_schedule_wu(struct sr_workunit *wu)
 {
     struct sr_discipline    *sd = wu->swu_dis;
     struct sr_workunit    *wup;
+    struct sr_workunit    *twup;
+
     int            s;

     DNPRINTF(SR_D_WU, "sr_schedule_wu: schedule wu %p state %i "
@@ -4210,20 +4279,53 @@ sr_schedule_wu(struct sr_workunit *wu)
     if (wu->swu_state != SR_WU_INPROGRESS)
         panic("sr_schedule_wu: work unit not in progress (state %i)\n",
             wu->swu_state);
+    /*
+     * Walk both the pending and deferred queues and find colliding wus.
+     * For each collision we append the current wu to the end of that
+     * wu's collider chain, then push the current wu onto the deferred queue.
+     */
+    int colliding = 0;

-    /* Walk queue backwards and fill in collider if we have one. */
-    TAILQ_FOREACH_REVERSE(wup, &sd->sd_wu_pendq, sr_wu_list, swu_link) {
-        if (wu->swu_blk_end < wup->swu_blk_start ||
-            wup->swu_blk_end < wu->swu_blk_start)
+    TAILQ_FOREACH_SAFE(wup, &sd->sd_wu_pendq, swu_link, twup) {
+        if (!sr_wu_collision(wu, wup))
             continue;

+        colliding = 1;
         /* Defer work unit due to LBA collision. */
-        DNPRINTF(SR_D_WU, "sr_schedule_wu: deferring work unit %p\n",
-            wu);
-        wu->swu_state = SR_WU_DEFERRED;
+        DNPRINTF(SR_D_WU, "sr_schedule_wu: deferring work unit %p"
+            " due to collision with: %p, scsi: %s, blk_s: %lld, blk_e:"
+            " %lld\n", wu, wup,
+            ((wup->swu_xs != NULL) ?
+            (wup->swu_xs->flags & SCSI_DATA_IN ? "READ" : "WRITE")
+            : "NULL"),
+            wup->swu_blk_start, wup->swu_blk_end);
         while (wup->swu_collider)
             wup = wup->swu_collider;
-        wup->swu_collider = wu;
+
+        if (wup != wu)
+            wup->swu_collider = wu;
+    }
+    TAILQ_FOREACH_SAFE(wup, &sd->sd_wu_defq, swu_link, twup) {
+        if (!sr_wu_collision(wu, wup))
+            continue;
+
+        colliding = 1;
+        DNPRINTF(SR_D_WU, "sr_schedule_wu: deferring work unit %p"
+            " due to collision with: %p, scsi: %s, blk_s: %lld, blk_e:"
+            " %lld\n", wu, wup,
+            ((wup->swu_xs != NULL) ?
+            (wup->swu_xs->flags & SCSI_DATA_IN ? "READ" : "WRITE")
+            : "NULL"),
+            wup->swu_blk_start, wup->swu_blk_end);
+
+        while (wup->swu_collider)
+            wup = wup->swu_collider;
+
+        if (wup != wu)
+            wup->swu_collider = wu;
+    }
+    if (colliding == 1) {
+        wu->swu_state = SR_WU_DEFERRED;
         TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);
         sd->sd_wu_collisions++;
         goto queued;
@@ -4845,7 +4947,15 @@ void
 sr_sensors_delete(struct sr_discipline *sd)
 {
     DNPRINTF(SR_D_STATE, "%s: sr_sensors_delete\n", DEVNAME(sd->sd_sc));
-
+    /* first run through chunk-specific sensors */
+    /* shall we enhance discipline API and add sensor delete function? */
+    int    chdx;
+    for (chdx = 0; chdx < sd->sd_meta->ssdi.ssd_chunk_no; chdx++) {
+        if (sd->sd_vol.sv_chunks[chdx]->src_sensor_attached) {
+            sensor_detach(&sd->sd_sc->sc_sensordev,
+                &sd->sd_vol.sv_chunks[chdx]->src_sensor);
+        }
+    }
     if (sd->sd_vol.sv_sensor_attached)
         sensor_detach(&sd->sd_sc->sc_sensordev, &sd->sd_vol.sv_sensor);
 }
@@ -4856,6 +4966,8 @@ sr_sensors_refresh(void *arg)
     struct sr_softc        *sc = arg;
     struct sr_volume    *sv;
     struct sr_discipline    *sd;
+    struct sr_chunk         *chunk;
+    struct sr_chunk_head    *cl;

     DNPRINTF(SR_D_STATE, "%s: sr_sensors_refresh\n", DEVNAME(sc));

@@ -4882,6 +4994,18 @@ sr_sensors_refresh(void *arg)
         default:
             sv->sv_sensor.value = 0; /* unknown */
             sv->sv_sensor.status = SENSOR_S_UNKNOWN;
+        }
+        /* shall we enhance discipline API and add
+           sensor refresh function? */
+        if (sd->sd_type == SR_MD_RAID1_CHKSUM
+            && sd->mds.mdd_raid1.sr1_use_chksum) {
+          /* refreshing chksum errors sensors */
+          cl = &sv->sv_chunk_list;
+          SLIST_FOREACH(chunk, cl, src_link)
+            if (chunk->src_errs > 0 && chunk->src_sensor_attached == 1) {
+              chunk->src_sensor.value = chunk->src_errs;
+              chunk->src_sensor.status = SENSOR_S_WARN;
+            }
         }
     }
 }
Index: sys/dev/softraid_raid1.c
===================================================================
RCS file: /cvs/src/sys/dev/softraid_raid1.c,v
retrieving revision 1.63
diff -u -p -u -r1.63 softraid_raid1.c
--- sys/dev/softraid_raid1.c    21 Jul 2015 03:30:51 -0000    1.63
+++ sys/dev/softraid_raid1.c    31 Aug 2015 20:02:50 -0000
@@ -41,6 +41,8 @@

 #include <dev/softraidvar.h>

+#include <lib/libz/zlib.h>
+
 /* RAID 1 functions. */
 int    sr_raid1_create(struct sr_discipline *, struct bioc_createraid *,
         int, int64_t);
@@ -48,15 +50,49 @@ int    sr_raid1_assemble(struct sr_discipli
         int, void *);
 int    sr_raid1_init(struct sr_discipline *sd);
 int    sr_raid1_rw(struct sr_workunit *);
+int    sr_raid1_openings(struct sr_discipline *);
 int    sr_raid1_wu_done(struct sr_workunit *);
 void    sr_raid1_set_chunk_state(struct sr_discipline *, int, int);
 void    sr_raid1_set_vol_state(struct sr_discipline *);
+int    sr_raid1_wu_collision_detection(struct sr_workunit *,
+        struct sr_workunit *);
+
+/* internal functions */
+int    sr_raid1_addio(struct sr_workunit *, int, daddr_t, daddr_t, void *,
+        int, int, void *);
+void    sr_raid1_intr(struct buf *);
+int    sr_raid1_verify_chksum(void*, int, daddr_t, daddr_t, void*);
+uLong    sr_raid1_update_chksum(void*, int, daddr_t, daddr_t, void*);
+int    sr_raid1_sensor_create(struct sr_discipline *, int);
+daddr_t    sr_raid1_chksum_blk_start(struct sr_workunit *);
+daddr_t    sr_raid1_chksum_blk_end(struct sr_workunit *);
+size_t    sr_raid1_chksum_data_len(struct sr_workunit *);
+
+
+#define CHKSUM_SIZE 8
+#define CHKSUM_IN_BLOCK (DEV_BSIZE / 8)
+
+
+struct sr_raid1c_opaque {
+    /* 0 == read, 1 == write */
+    int        write;
+    void        *data;
+    int        len;
+    daddr_t        blk_start;
+    daddr_t        blk_end;
+    void        *chksum_data;
+};
+

 /* Discipline initialisation. */
 void
 sr_raid1_discipline_init(struct sr_discipline *sd)
 {
     /* Fill out discipline members. */
+    /*
+     * For now we assume we run without checksums; if this is not true
+     * we will correct the values in the _create or _assemble functions.
+     */
     sd->sd_type = SR_MD_RAID1;
     strlcpy(sd->sd_name, "RAID 1", sizeof(sd->sd_name));
     sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
@@ -70,20 +106,48 @@ sr_raid1_discipline_init(struct sr_disci
     sd->sd_scsi_wu_done = sr_raid1_wu_done;
     sd->sd_set_chunk_state = sr_raid1_set_chunk_state;
     sd->sd_set_vol_state = sr_raid1_set_vol_state;
+    sd->sd_scsi_intr = sr_raid1_intr;
 }

 int
 sr_raid1_create(struct sr_discipline *sd, struct bioc_createraid *bc,
     int no_chunk, int64_t coerced_size)
 {
+    int ch;
     if (no_chunk < 2) {
         sr_error(sd->sd_sc, "%s requires two or more chunks",
             sd->sd_name);
         return EINVAL;
     }
-
-    sd->sd_meta->ssdi.ssd_size = coerced_size;
-
+    if (bc->bc_flags & BIOC_SCCHKSUM) {
+        int64_t chksum_area_size = coerced_size * CHKSUM_SIZE
+            / DEV_BSIZE;
+        if (((coerced_size * CHKSUM_SIZE) % DEV_BSIZE) != 0) {
+            chksum_area_size++;
+        }
+        DNPRINTF(SR_D_MISC, "RAID 1 CHKSUM: coerced size: %lld,"
+            " data size: %lld, chksum area size: %lld\n, ",
+            coerced_size, coerced_size - chksum_area_size,
+            chksum_area_size);
+        sd->sd_meta->ssdi.ssd_size = coerced_size - chksum_area_size;
+        sd->mds.mdd_raid1.sr1_coerced_size = coerced_size;
+        sd->mds.mdd_raid1.sr1_use_chksum = 1;
+        /* fixing discipline values for chksum support */
+        sd->sd_type = SR_MD_RAID1_CHKSUM;
+        strlcpy(sd->sd_name, "RAID 1C", sizeof(sd->sd_name));
+        sd->sd_openings = sr_raid1_openings;
+        sd->sd_wu_collision_detection = sr_raid1_wu_collision_detection;
+        for (ch = 0; ch < no_chunk; ch++) {
+            if (sr_raid1_sensor_create(sd, ch)) {
+                DNPRINTF(SR_D_MISC, "RAID 1C: sensor can't be"
+                    " created for chunk: %d\n", ch);
+            }
+        }
+    }
+    else {
+        sd->sd_meta->ssdi.ssd_size = coerced_size;
+        sd->mds.mdd_raid1.sr1_use_chksum = 0;
+    }
     return sr_raid1_init(sd);
 }

@@ -91,17 +155,55 @@ int
 sr_raid1_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
     int no_chunk, void *data)
 {
+    int ch;
+    if (bc->bc_flags & BIOC_SCCHKSUM) {
+        int64_t coerced_size = sd->sd_vol.sv_chunks[0]
+            ->src_meta.scmi.scm_coerced_size;
+        DNPRINTF(SR_D_MISC, "RAID 1 CHKSUM: coerced size: %lld\n, ",
+            coerced_size);
+        sd->mds.mdd_raid1.sr1_coerced_size = coerced_size;
+        sd->mds.mdd_raid1.sr1_use_chksum = 1;
+        /* fixing discipline values for chksum support */
+        sd->sd_type = SR_MD_RAID1_CHKSUM;
+        strlcpy(sd->sd_name, "RAID 1C", sizeof(sd->sd_name));
+        sd->sd_openings = sr_raid1_openings;
+        sd->sd_wu_collision_detection = sr_raid1_wu_collision_detection;
+        for (ch = 0; ch < no_chunk; ch++) {
+            if (sr_raid1_sensor_create(sd, ch)) {
+                DNPRINTF(SR_D_MISC, "RAID 1C: sensor can't be"
+                    " created for chunk: %d\n", ch);
+            }
+        }
+    }
+    else {
+        sd->mds.mdd_raid1.sr1_use_chksum = 0;
+    }
     return sr_raid1_init(sd);
 }

 int
 sr_raid1_init(struct sr_discipline *sd)
 {
-    sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no;
-
+    if (sd->mds.mdd_raid1.sr1_use_chksum) {
+        /*
+         * In case of chksum support we use two ccbs per chunk
+         * for read and write.
+         */
+        sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no * 2;
+    }
+    else {
+        sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no;
+    }
     return 0;
 }

+int
+sr_raid1_openings(struct sr_discipline *sd)
+{
+    /* Max two work units per I/O (in case of write) */
+    return sd->sd_max_wu >> 1;
+}
+
 void
 sr_raid1_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
 {
@@ -324,17 +426,24 @@ die:
 int
 sr_raid1_rw(struct sr_workunit *wu)
 {
+    struct sr_workunit    *wu_r_chksum = NULL;
     struct sr_discipline    *sd = wu->swu_dis;
     struct scsi_xfer    *xs = wu->swu_xs;
     struct sr_ccb        *ccb;
     struct sr_chunk        *scp;
     int            ios, chunk, i, rt;
     daddr_t            blkno;
+    int use_chksum = sd->mds.mdd_raid1.sr1_use_chksum;

     /* blkno and scsi error will be handled by sr_validate_io */
     if (sr_validate_io(wu, &blkno, "sr_raid1_rw"))
         goto bad;

+    DNPRINTF(SR_D_CHKSUM, "sr_raid1_rw: blkno: %lld, len: %d, %s,"
+        " swu_block_start %lld, swu_block_end: %lld\n", blkno, xs->datalen,
+        (xs->flags & SCSI_DATA_IN) ? "READ" : "WRITE", wu->swu_blk_start,
+        wu->swu_blk_end);
+
     if (xs->flags & SCSI_DATA_IN)
         ios = 1;
     else
@@ -368,8 +477,8 @@ ragain:
             }
         } else {
             /* writes go on all working disks */
-            chunk = i;
-            scp = sd->sd_vol.sv_chunks[chunk];
+                chunk = i;
+                scp = sd->sd_vol.sv_chunks[chunk];
             switch (scp->src_meta.scm_status) {
             case BIOC_SDONLINE:
             case BIOC_SDSCRUB:
@@ -384,25 +493,155 @@ ragain:
                 goto bad;
             }
         }
-
-        ccb = sr_ccb_rw(sd, chunk, blkno, xs->datalen, xs->data,
-            xs->flags, 0);
-        if (!ccb) {
-            /* should never happen but handle more gracefully */
-            printf("%s: %s: too many ccbs queued\n",
-                DEVNAME(sd->sd_sc),
-                sd->sd_meta->ssd_devname);
-            goto bad;
+        if (use_chksum) {
+            daddr_t chksum_blk = sr_raid1_chksum_blk_start(wu);
+            void* chksum_data = NULL;
+            size_t chksum_data_len = sr_raid1_chksum_data_len(wu);
+            if (xs->flags & SCSI_DATA_IN) {
+                /* read data */
+                if (sr_raid1_addio(wu, chunk, blkno,
+                    xs->datalen, xs->data, xs->flags, 0, 0)) {
+                    printf("%s: %s: too many ccbs queued"
+                        " (2c)\n", DEVNAME(sd->sd_sc),
+                        sd->sd_meta->ssd_devname);
+                    goto bad;
+                }
+                chksum_data = sr_block_get(sd, DEV_BSIZE
+                    * chksum_data_len);
+                if (!chksum_data) {
+                    goto bad;
+                }
+                struct sr_raid1c_opaque *chksum_info = malloc(
+                    sizeof(struct sr_raid1c_opaque), M_DEVBUF,
+                    M_ZERO | M_NOWAIT);
+                if (!chksum_info) {
+                    panic("%s: %s: can't allocate"
+                        " chksum_info structure\n",
+                        DEVNAME(sd->sd_sc),
+                        sd->sd_meta->ssd_devname);
+                }
+                chksum_info->write = 0;
+                chksum_info->blk_start = wu->swu_blk_start;
+                chksum_info->blk_end = wu->swu_blk_end;
+                chksum_info->data = xs->data;
+                chksum_info->len = xs->datalen;
+                chksum_info->chksum_data = chksum_data;
+                /* read chksum */
+                if (sr_raid1_addio(wu, chunk, chksum_blk,
+                    DEV_BSIZE * chksum_data_len, chksum_data,
+                    SCSI_DATA_IN, 0, chksum_info)) {
+                    sr_block_put(sd, chksum_data,
+                        DEV_BSIZE * chksum_data_len);
+                    printf("%s: %s: too many ccbs queued"
+                        " (2c)\n", DEVNAME(sd->sd_sc),
+                        sd->sd_meta->ssd_devname);
+                    goto bad;
+                }
+            }
+            else {
+                /* write with chksum */
+                struct sr_raid1c_opaque *chksum_info = malloc(
+                    sizeof(struct sr_raid1c_opaque), M_DEVBUF,
+                    M_ZERO | M_NOWAIT);
+                if (!chksum_info) {
+                    panic("%s: %s: can't allocate"
+                        " chksum_info structure\n",
+                        DEVNAME(sd->sd_sc),
+                        sd->sd_meta->ssd_devname);
+                }
+                if (!wu_r_chksum) {
+                    if ((wu_r_chksum = sr_scsi_wu_get(sd,
+                        SCSI_NOSLEEP)) == NULL) {
+                        printf("%s: %s failed to get"
+                            " read work unit\n",
+                            DEVNAME(sd->sd_sc),
+                            sd->sd_meta->ssd_devname);
+                        goto bad;
+                    }
+                    wu_r_chksum->swu_state
+                        = SR_WU_INPROGRESS;
+                    wu_r_chksum->swu_flags
+                        |= SR_WUF_DISCIPLINE;
+                    wu_r_chksum->swu_blk_start
+                        = sr_raid1_chksum_blk_start(wu);
+                    wu_r_chksum->swu_blk_end
+                        = sr_raid1_chksum_blk_end(wu);
+                    DNPRINTF(SR_D_CHKSUM, "sr_raid1_rw:"
+                        " wu_r_chksum: %p\n", wu_r_chksum);
+                }
+                chksum_data = sr_block_get(sd,
+                    DEV_BSIZE * chksum_data_len);
+                if (!chksum_data) {
+                    goto bad;
+                }
+                chksum_info->write = 1;
+                chksum_info->blk_start = wu->swu_blk_start;
+                chksum_info->blk_end = wu->swu_blk_end;
+                chksum_info->data = xs->data;
+                chksum_info->len = xs->datalen;
+                chksum_info->chksum_data = chksum_data;
+                DNPRINTF(SR_D_CHKSUM, "rw: chksum_info: %p\n",
+                    chksum_info);
+                /* read chksum */
+                if (sr_raid1_addio(wu_r_chksum, chunk,
+                    chksum_blk, DEV_BSIZE * chksum_data_len,
+                    chksum_data, SCSI_DATA_IN, 0,
+                    chksum_info)) {
+                    sr_block_put(sd, chksum_data,
+                        DEV_BSIZE * chksum_data_len);
+                    printf("%s: %s: too many ccbs queued"
+                        " (2c)\n", DEVNAME(sd->sd_sc),
+                        sd->sd_meta->ssd_devname);
+                    goto bad;
+                }
+                /* write data */
+                if (sr_raid1_addio(wu, chunk, blkno,
+                    xs->datalen, xs->data, xs->flags, 0, 0)) {
+                    printf("%s: %s: too many ccbs queued"
+                        " (2c)\n", DEVNAME(sd->sd_sc),
+                        sd->sd_meta->ssd_devname);
+                    goto bad;
+                }
+                /* write chksum */
+                if (sr_raid1_addio(wu, chunk, chksum_blk,
+                    DEV_BSIZE * chksum_data_len, chksum_data,
+                    xs->flags, SR_CCBF_FREEBUF, 0)) {
+                    printf("%s: %s: too many ccbs queued"
+                        " (2c)\n", DEVNAME(sd->sd_sc),
+                        sd->sd_meta->ssd_devname);
+                    goto bad;
+                }
+            }
+        }
+        else {
+          /* RAID 1 without chksum support */
+          ccb = sr_ccb_rw(sd, chunk, blkno, xs->datalen, xs->data,
+                  xs->flags, 0);
+          if (!ccb) {
+            /* should never happen but handle more gracefully */
+            printf("%s: %s: too many ccbs queued\n",
+               DEVNAME(sd->sd_sc),
+               sd->sd_meta->ssd_devname);
+            goto bad;
+          }
+
+          sr_wu_enqueue_ccb(wu, ccb);
         }
-        sr_wu_enqueue_ccb(wu, ccb);
     }
-
+    if (wu_r_chksum) {
+      /* collide write request with chksum reads */
+      wu_r_chksum->swu_blk_start = wu->swu_blk_start;
+      wu_r_chksum->swu_blk_end = wu->swu_blk_end;
+      sr_schedule_wu(wu_r_chksum);
+    }
     sr_schedule_wu(wu);

     return (0);

 bad:
     /* wu is unwound by sr_wu_put */
+    if (wu_r_chksum)
+        sr_scsi_wu_put(sd, wu_r_chksum);
     return (1);
 }

@@ -411,7 +650,83 @@ sr_raid1_wu_done(struct sr_workunit *wu)
 {
     struct sr_discipline    *sd = wu->swu_dis;
     struct scsi_xfer    *xs = wu->swu_xs;
+    struct sr_ccb           *ccb = NULL;
+
+    /* XXX - we have no way of propagating errors... */
+    if (wu->swu_flags & SR_WUF_DISCIPLINE) {
+        DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done (read chksum):"
+            " %p, blk_s: %lld, blk_e: %lld\n", wu, wu->swu_blk_start,
+            wu->swu_blk_end);
+        /*
+         * This is read chksum wu for a data write wu, we need to
+         * free ccb->ccb_opaque which is checksum_info here since
+         * ccb_buf with chksum data is passed directly to write
+         * and we do not need chksum_info anymore.
+         */
+        TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link) {
+            if (ccb->ccb_opaque) {
+                struct sr_raid1c_opaque *chksum_info
+                    = ccb->ccb_opaque;
+                DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done: free"
+                    " chksum_info: %p\n", chksum_info);
+                free(chksum_info, M_DEVBUF, 0);
+                ccb->ccb_opaque = NULL;
+            }
+        }
+        return SR_WU_OK;
+    }
+
+    if (wu->swu_ios_complete != wu->swu_io_count)
+      return SR_WU_INPROGRESS;

+
+    if (xs->flags & SCSI_DATA_IN) {
+        /* read: verify chksum */
+        TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link) {
+            if (ccb->ccb_opaque != NULL) {
+                /* ccb is chksum ccb */
+                struct sr_raid1c_opaque *chksum_info
+                    = ccb->ccb_opaque;
+                if (sr_raid1_verify_chksum(chksum_info->data,
+                    chksum_info->len, chksum_info->blk_start,
+                    chksum_info->blk_end,
+                    ccb->ccb_buf.b_data)) {
+                    wu->swu_state = SR_WU_CHKSUMFAILED;
+                    DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done"
+                        ": verify failed on area %lld-%lld"
+                        " with wu state: %d and flags: %d"
+                        " on chunk: %d\n",
+                        wu->swu_blk_start, wu->swu_blk_end,
+                        wu->swu_state, wu->swu_flags,
+                        ccb->ccb_target);
+                    /* update chunk error value */
+                    if (ccb->ccb_target != -1) {
+                        sd->sd_vol.sv_chunks
+                            [ccb->ccb_target]
+                            ->src_errs++;
+                    }
+                }
+                /* free chksum ccb buf */
+                DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done:"
+                    " sr_block_put:"
+                    " %p, size: %ld\n", ccb->ccb_buf.b_data,
+                    ccb->ccb_buf.b_bcount);
+                sr_block_put(sd, ccb->ccb_buf.b_data,
+                    ccb->ccb_buf.b_bcount);
+                ccb->ccb_buf.b_data = NULL;
+                free(chksum_info, M_DEVBUF, 0);
+                ccb->ccb_opaque = NULL;
+            }
+        }
+    }
+
+    if (wu->swu_state == SR_WU_CHKSUMFAILED) {
+        ccb = TAILQ_FIRST(&wu->swu_ccb);
+        printf("%s: chunk: %d: verify chksum failed on %lld-%lld"
+            " block(s)\n",
+            sd->sd_meta->ssd_devname, ccb->ccb_target,
+            wu->swu_blk_start, wu->swu_blk_end);
+    }
     /* If at least one I/O succeeded, we are okay. */
     if (wu->swu_ios_succeeded > 0) {
         xs->error = XS_NOERROR;
@@ -438,4 +753,230 @@ sr_raid1_wu_done(struct sr_workunit *wu)
     xs->error = XS_DRIVER_STUFFUP;

     return SR_WU_FAILED;
+}
+
+/* Return 1 when the data or the chksum block ranges of both wus overlap. */
+int
+sr_raid1_wu_collision_detection(struct sr_workunit *wu1,
+    struct sr_workunit *wu2)
+{
+    daddr_t    cs1, ce1, cs2, ce2;    /* inclusive chksum block ranges */
+    int        data_apart, chksum_apart;
+
+    if (wu1 == NULL || wu2 == NULL)
+        return 0;
+    cs1 = sr_raid1_chksum_blk_start(wu1);
+    ce1 = sr_raid1_chksum_blk_end(wu1);
+    cs2 = sr_raid1_chksum_blk_start(wu2);
+    ce2 = sr_raid1_chksum_blk_end(wu2);
+    /* Two inclusive ranges are apart when one ends before the other starts. */
+    data_apart = wu1->swu_blk_end < wu2->swu_blk_start ||
+        wu2->swu_blk_end < wu1->swu_blk_start;
+    chksum_apart = ce1 < cs2 || ce2 < cs1;
+    if (data_apart && chksum_apart)
+        return 0;
+    DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_collision_detection: collision"
+        " found! wu1: %p, blk_s: %lld, blk_e: %lld,"
+        " chksum_blk_s: %lld, chksum_blk_e: %lld, wu2: %p,"
+        " blk_s: %lld, blk_e: %lld, chksum_blk_s: %lld, chksum_blk_e:"
+        " %lld\n", wu1, wu1->swu_blk_start, wu1->swu_blk_end,
+        cs1, ce1, wu2, wu2->swu_blk_start,
+        wu2->swu_blk_end, cs2, ce2);
+    return 1;
+}
+
+/* Build one chunk I/O ccb, tag it with chksumbuf and queue it on wu. */
+int
+sr_raid1_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
+    daddr_t len, void *data, int xsflags, int ccbflags, void *chksumbuf)
+{
+    struct sr_discipline    *dis = wu->swu_dis;
+    struct sr_ccb        *io;
+
+    DNPRINTF(SR_D_CHKSUM, "sr_raid1_addio: %s chunk %d block %lld "
+        "length %lld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write",
+        chunk, (long long)blkno, (long long)len,
+        chksumbuf ? "CHKSUM" : "-");
+    /* No caller buffer: get a temporary one and flag it for later release. */
+    if (data == NULL) {
+        if ((data = sr_block_get(dis, len)) == NULL)
+            return (-1);
+        ccbflags |= SR_CCBF_FREEBUF;
+    }
+    io = sr_ccb_rw(dis, chunk, blkno, len, data, xsflags, ccbflags);
+    if (io != NULL) {
+        io->ccb_opaque = chksumbuf;
+        sr_wu_enqueue_ccb(wu, io);
+        return (0);
+    }
+    /* ccb setup failed: give the buffer back when it is flagged FREEBUF. */
+    if (ccbflags & SR_CCBF_FREEBUF)
+        sr_block_put(dis, data, len);
+    return (-1);
+}
+
+/* I/O completion for one RAID1 ccb; bp is cast back to its struct sr_ccb. */
+void
+sr_raid1_intr(struct buf *bp)
+{
+    struct sr_ccb        *ccb = (struct sr_ccb *)bp;
+    struct sr_workunit    *wu = ccb->ccb_wu;
+    struct sr_discipline    *sd = wu->swu_dis;
+    int            s;
+    int            use_chksum;
+
+    use_chksum = sd->mds.mdd_raid1.sr1_use_chksum;
+    DNPRINTF(SR_D_INTR, "%s: sr_raid1_intr bp %p xs %p\n",
+        DEVNAME(sd->sd_sc), bp, wu->swu_xs);
+
+    s = splbio();
+    sr_ccb_done(ccb);
+
+    if (use_chksum && ccb->ccb_state == SR_CCB_OK
+        && ccb->ccb_opaque) {
+        struct sr_raid1c_opaque *chksum_info = ccb->ccb_opaque;
+        if (chksum_info->write == 1) {
+            /* write: store fresh chksums of the data in this ccb's buffer */
+            sr_raid1_update_chksum(chksum_info->data,
+                chksum_info->len, chksum_info->blk_start,
+                chksum_info->blk_end, ccb->ccb_buf.b_data);
+        }
+    }
+    /* Give back the temporary buffer of ccbs flagged SR_CCBF_FREEBUF. */
+    if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
+        sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
+        ccb->ccb_buf.b_data = NULL;
+    }
+    sr_wu_done(wu);
+    splx(s);
+}
+
+/* Check each sector of data against its CRC32 in chksum_buf; len is unused. */
+int
+sr_raid1_verify_chksum(void* data, int len, daddr_t blk_start, daddr_t blk_end,
+    void* chksum_buf)
+{
+    int32_t        chksum_n = blk_start % CHKSUM_IN_BLOCK; /* first slot */
+    int32_t        chksum_count = blk_end - blk_start + 1; /* inclusive */
+    uLong        *chksum = chksum_buf; /* NOTE(review): slot is sizeof(uLong) */
+    Bytef        *buf = data;
+    int32_t        i;
+
+    for (i = 0; i < chksum_count; i++, chksum_n++) {
+        uLong crc = crc32(0L, Z_NULL, 0);
+        crc = crc32(crc, &(buf[i * DEV_BSIZE]), DEV_BSIZE);
+        if (crc != chksum[chksum_n]) {
+            DNPRINTF(SR_D_CHKSUM, "verify failed on comparison"
+                " block chksum(%lu) and saved chksum(%lu)[%d],"
+                " called for start: %lld, end: %lld, n: %d, failed"
+                " block: %lld\n", crc, chksum[chksum_n], chksum_n,
+                blk_start, blk_end, chksum_n, blk_start + i);
+            return (-1);    /* the first mismatch ends the scan */
+        }
+    }
+    return 0;    /* every sector matched */
+}
+
+/* Store the CRC32 of each sector of data in chksum_buf; always returns 0. */
+uLong
+sr_raid1_update_chksum(void* data, int len, daddr_t blk_start, daddr_t blk_end,
+    void* chksum_buf)
+{
+    int32_t        chksum_n = blk_start % CHKSUM_IN_BLOCK; /* first slot */
+    int32_t        chksum_count = blk_end - blk_start + 1; /* inclusive */
+    uLong        *chksum = chksum_buf; /* NOTE(review): slot is sizeof(uLong) */
+    Bytef        *buf = data;
+    int32_t        i;
+
+    DNPRINTF(SR_D_CHKSUM, "update chksum: start: %lld, end: %lld, n: %d\n",
+        blk_start, blk_end, chksum_n);
+    DNPRINTF(SR_D_CHKSUM, "blocks chksumed: ");
+    for (i = 0; i < chksum_count; i++, chksum_n++) {
+        uLong crc = crc32(0L, Z_NULL, 0);
+        crc = crc32(crc, &(buf[i * DEV_BSIZE]), DEV_BSIZE);
+        chksum[chksum_n] = crc;
+        DNPRINTF(SR_D_CHKSUM, "%lld->(%lu)[%d], ", (blk_start + i),
+            crc, chksum_n);
+    }
+    DNPRINTF(SR_D_CHKSUM, "\n");
+    return 0;
+}
+
+/* Zero the error count of chunk chno and attach its sensor; returns 0. */
+int
+sr_raid1_sensor_create(struct sr_discipline* sd, int chno)
+{
+    struct sr_chunk        *ch = sd->sd_vol.sv_chunks[chno];
+    struct ksensor        *sens = &ch->src_sensor;
+
+    strlcpy(sens->desc, ch->src_devname, sizeof(sens->desc));
+    sens->type = SENSOR_INTEGER;
+    sens->status = SENSOR_S_OK;
+    ch->src_errs = 0;
+    sensor_attach(&sd->sd_sc->sc_sensordev, sens);
+    ch->src_sensor_attached = 1;
+    return 0;
+}
+
+/*
+ * Block number of the chksum block covering the first data block of wu:
+ * ssd_size plus the index of the CHKSUM_IN_BLOCK-sized group it belongs to.
+ */
+daddr_t
+sr_raid1_chksum_blk_start(struct sr_workunit *wu)
+{
+    struct sr_discipline    *dis = wu->swu_dis;
+
+    return dis->sd_meta->ssdi.ssd_size +
+        (wu->swu_blk_start / CHKSUM_IN_BLOCK);
+}
+
+daddr_t
+sr_raid1_chksum_blk_end(struct sr_workunit *wu)
+{
+    /* inclusive end: first chksum block plus the block count, minus one */
+    return sr_raid1_chksum_blk_start(wu) - 1 + sr_raid1_chksum_data_len(wu);
+}
+
+/* Number of DEV_BSIZE-sized chksum blocks that hold the chksums of wu. */
+size_t
+sr_raid1_chksum_data_len(struct sr_workunit* wu)
+{
+    size_t        chksum_len;    /* bytes of chksums for the wu's blocks */
+    size_t        chksum_offset;    /* byte offset within the first block */
+
+    chksum_len = (wu->swu_blk_end - wu->swu_blk_start + 1) * CHKSUM_SIZE;
+    chksum_offset = (wu->swu_blk_start % CHKSUM_IN_BLOCK) * CHKSUM_SIZE;
+
+    /* Round up: bytes ending exactly on a block boundary need no extra block. */
+    return (chksum_len + chksum_offset + DEV_BSIZE - 1) / DEV_BSIZE;
+}
+
+/*
+ * Debug helper used in softraid.c directly: prints msg followed by the kind
+ * of wu (chksum read, read or write) and its data and chksum block ranges.
+ */
+void
+sr_raid1_print_wu(struct sr_workunit *wu, int f, const char* msg)
+{
+    /* msg is plain text: pass it as an argument, never as the format. */
+    DNPRINTF(f, "%s", msg);
+    if (wu->swu_flags & SR_WUF_DISCIPLINE) {
+        DNPRINTF(f, "(read chksum) %p, blk_s: %lld, blk_e: %lld,"
+            " chksum_s: %lld, chksum_e: %lld\n", wu, wu->swu_blk_start,
+            wu->swu_blk_end, sr_raid1_chksum_blk_start(wu),
+            sr_raid1_chksum_blk_end(wu));
+    } else if (wu->swu_xs->flags & SCSI_DATA_IN) {
+        DNPRINTF(f, "(read) %p, blk_s: %lld, blk_e: %lld,"
+            " chksum_s: %lld, chksum_e: %lld\n", wu,
+            wu->swu_blk_start, wu->swu_blk_end,
+            sr_raid1_chksum_blk_start(wu),
+            sr_raid1_chksum_blk_end(wu));
+    } else {
+        DNPRINTF(f, "(write) %p, blk_s: %lld, blk_e: %lld,"
+            " chksum_s: %lld, chksum_e: %lld\n", wu,
+            wu->swu_blk_start, wu->swu_blk_end,
+            sr_raid1_chksum_blk_start(wu),
+            sr_raid1_chksum_blk_end(wu));
+    }
 }
Index: sys/dev/softraidvar.h
===================================================================
RCS file: /cvs/src/sys/dev/softraidvar.h,v
retrieving revision 1.161
diff -u -p -u -r1.161 softraidvar.h
--- sys/dev/softraidvar.h    21 Jul 2015 03:30:51 -0000    1.161
+++ sys/dev/softraidvar.h    31 Aug 2015 20:02:50 -0000
@@ -307,7 +307,7 @@ SLIST_HEAD(sr_boot_volume_head, sr_boot_

 #define DEVNAME(_s)     ((_s)->sc_dev.dv_xname)

-/* #define SR_DEBUG */
+#define SR_DEBUG
 #ifdef SR_DEBUG
 extern u_int32_t        sr_debug;
 #define DPRINTF(x...)        do { if (sr_debug) printf(x); } while(0)
@@ -322,6 +322,7 @@ extern u_int32_t        sr_debug;
 #define    SR_D_DIS        0x0080
 #define    SR_D_STATE        0x0100
 #define    SR_D_REBUILD        0x0200
+#define    SR_D_CHKSUM        0x0400
 #else
 #define DPRINTF(x...)
 #define DNPRINTF(n,x...)
@@ -378,6 +379,7 @@ struct sr_workunit {
 #define SR_WU_RESTART        7
 #define SR_WU_REQUEUE        8
 #define SR_WU_CONSTRUCT        9
+#define    SR_WU_CHKSUMFAILED    10

     int            swu_flags;    /* additional hints */
 #define SR_WUF_REBUILD        (1<<0)        /* rebuild io */
@@ -426,6 +428,10 @@ struct sr_raid0 {
 #define SR_RAID1_NOWU        16
 struct sr_raid1 {
     u_int32_t        sr1_counter;
+    u_int32_t        sr1_use_chksum;    /* non-zero: checksums are in use */
+
+    /* original coerced size in blocks */
+    int64_t            sr1_coerced_size;
 };

 /* RAID 5 */
@@ -474,6 +480,10 @@ struct sr_chunk {
     u_char            src_duid[8];    /* Chunk disklabel UID. */
     int64_t            src_size;    /* in blocks */

+    struct ksensor        src_sensor;    /* Chunk specific sensor */
+    int            src_sensor_attached;
+    int            src_errs;    /* Errors counter value */
+
     SLIST_ENTRY(sr_chunk)    src_link;
 };

@@ -503,6 +513,7 @@ struct sr_discipline {
     /* SR_MD_RAID4 was 7. */
 #define    SR_MD_RAID6        8
 #define    SR_MD_CONCAT        9
+#define    SR_MD_RAID1_CHKSUM    10
     char            sd_name[10];    /* human readable dis name */
     u_int16_t        sd_target;    /* scsibus target discipline uses */

@@ -512,6 +523,7 @@ struct sr_discipline {
 #define SR_CAP_REBUILD        0x00000004    /* Supports rebuild. */
 #define SR_CAP_NON_COERCED    0x00000008    /* Uses non-coerced size. */
 #define SR_CAP_REDUNDANT    0x00000010    /* Redundant copies of data. */
+#define SR_CAP_CHKSUM        0x00000020    /* Check sums of data. */

     union {
         struct sr_raid0    mdd_raid0;
@@ -583,6 +595,9 @@ struct sr_discipline {
     int            (*sd_meta_opt_handler)(struct sr_discipline *,
                     struct sr_meta_opt_hdr *);
     void            (*sd_rebuild)(struct sr_discipline *);
+
+    int            (*sd_wu_collision_detection)(
+                    struct sr_workunit *, struct sr_workunit *);

     /* SCSI emulation */
     struct scsi_sense_data    sd_scsi_sense;