Multiple Device Driver
(Linux Software RAID)
Ted Baker, Andy Wang
CIS 4930 / COP 5641

The md driver

- Provides virtual devices
  - Created from one or more independent underlying devices
- The basic mechanism to support RAIDs
  - Redundant arrays of inexpensive disks

Common RAID levels

- RAID0
  - Striping (see the mapping sketch after this list)
- RAID1
  - Mirroring
- RAID4 (≥ 3 disks)
  - Striped array with a dedicated parity device
- RAID5 (≥ 3 disks)
  - Striped array with distributed parity
- RAID6 (≥ 4 disks)
  - Striped array with dual redundancy information
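
To make striping concrete, here is a minimal sketch, not taken from the md sources, of how a RAID0-style layout assigns logical chunks to member disks round-robin; the function and parameter names are illustrative.

    #include <stdio.h>

    /* Hypothetical RAID0 mapping: logical chunk i lives on disk
     * (i % ndisks), at chunk index (i / ndisks) within that disk. */
    static void raid0_map(unsigned long chunk, int ndisks,
                          int *disk, unsigned long *chunk_on_disk)
    {
        *disk = (int)(chunk % ndisks);
        *chunk_on_disk = chunk / ndisks;
    }

    int main(void)
    {
        for (unsigned long c = 0; c < 8; c++) {
            int disk;
            unsigned long off;
            raid0_map(c, 4, &disk, &off);
            printf("logical chunk %lu -> disk %d, chunk %lu\n", c, disk, off);
        }
        return 0;
    }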

Common RAID levels

- RAID1+0
  - Striped array of mirrored disks
- RAID0+1
  - Mirroring two RAID0s
- RAID5+0
  - Striped array of RAID5s
- RAID5+1
  - Mirroring two RAID5s

md pseudo RAID configurations

- Linear (concatenates multiple disks into a single one)
- Multipath
  - A set of different interfaces to the same device (e.g., multiple disk controllers)
- Faulty
  - A layer over a single device into which errors can be injected

RAID Creation

> mdadm --create /dev/md0 --level=1 --raid-devices=2 /dev/hd[ac]1

- Creates /dev/md0 as a RAID1 array
  - Consisting of /dev/hda1 and /dev/hdc1

RAID Status

- To check the status of RAIDs, see /proc/mdstat

    Personalities : [raid1]
    md0 : active raid1 sda5[0] sdb5[1]
          979840 blocks [2/2] [UU]
    md1 : active raid1 sda6[2] sdb6[1]
          159661888 blocks [2/1] [_U]
          [===>.................] recovery = 17.9% (28697920/159661888) finish=56.4min speed=38656K/sec
    unused devices: <none>

- [2/2] [UU] means both members of md0 are up to date; [2/1] [_U] means only one member of md1 is active while the other is being rebuilt
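
As a sanity check on the recovery line (block counts in /proc/mdstat are 1KB units): (159661888 − 28697920) KB remaining ÷ 38656 KB/s ≈ 3388 s ≈ 56.5 min, which matches the reported finish=56.4min.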

md Super Block

- Each device in a RAID may have a superblock with various information
  - Level
  - UUID
    - A 128-bit identifier that identifies an array

Some RAID Concepts

- Personality
  - RAID level
- Chunk size
  - A power of two, > 4KB
  - A RAID assigns chunks to disks in a round-robin fashion
- Stripe
  - The collection of the ith chunk on each disk forms a stripe
- Parity
  - A chunk constructed by XORing the other chunks in a stripe (sketch below)
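
A minimal sketch of the parity computation, illustrative rather than md's actual code: XOR the corresponding bytes of every data chunk in a stripe.

    #include <stddef.h>
    #include <string.h>

    /* Hypothetical helper: parity chunk = XOR of all data chunks.
     * chunks[i] points to the ith data chunk; every chunk is
     * chunk_size bytes long. */
    static void compute_parity(unsigned char **chunks, int nchunks,
                               size_t chunk_size, unsigned char *parity)
    {
        memset(parity, 0, chunk_size);
        for (int i = 0; i < nchunks; i++)
            for (size_t b = 0; b < chunk_size; b++)
                parity[b] ^= chunks[i][b];
    }

Because XOR is its own inverse, any single lost chunk can be rebuilt by XORing the parity chunk with the surviving data chunks.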

Synchrony

- An update may involve both the data block and the parity block (sketched below)
- Implications
  - A RAID may be shut down in an inconsistent state
  - Resynchronization may be required at startup, in the background
    - Reduced performance
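
Why a small write touches two blocks: RAID5-style arrays typically update parity with a read-modify-write, shown here as an illustrative one-byte sketch.

    /* Hypothetical small-write parity update (read-modify-write):
     * derive the new parity from the old parity, old data, and new
     * data, without reading the rest of the stripe. */
    static unsigned char update_parity(unsigned char old_parity,
                                       unsigned char old_data,
                                       unsigned char new_data)
    {
        return old_parity ^ old_data ^ new_data;
    }

If the machine crashes after the data block reaches disk but before the parity block does, the stripe is left inconsistent; that window is exactly what resynchronization repairs.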

Recovery

- If the md driver detects a write error, it immediately disables that device
  - Continues operation on the remaining devices
  - Starts recreating the content if there is a spare drive

Recovery

- If the md driver detects a read error
  - Overwrites the bad block
  - Reads the block again
    - If that fails, treats it as a write error
- Recovery is a background process
  - Its speed can be configured via
    - /proc/sys/dev/raid/speed_limit_min
    - /proc/sys/dev/raid/speed_limit_max

Bitmap Write-Intent Logging

- Records which blocks of the array may be out of sync
- Speeds up resynchronization
- Allows a disk to be temporarily removed and reinserted without causing an enormous recovery cost
  - Can spin down disks for power savings

Bitmap Write-Intent Logging

- Can be stored on a separate device

Write-Behind

- Certain devices in the array can be flagged as write-mostly
- md will not wait for writes to write-behind devices to complete before returning to the file system

Restriping (Reshaping)

- Change the number of disks
- Change the RAID level
- Not robust against failures

faulty.c

    static int __init raid_init(void) {
        return register_md_personality(&faulty_personality);
    }

    static void raid_exit(void) {
        unregister_md_personality(&faulty_personality);
    }

    module_init(raid_init);
    module_exit(raid_exit);

faulty.c

    static struct mdk_personality faulty_personality = {
        .name          = "faulty",
        .level         = LEVEL_FAULTY,
        .owner         = THIS_MODULE,
        .make_request  = make_request,
        .run           = run,
        .stop          = stop,
        .status        = status,
        .check_reshape = reshape,
        .size          = faulty_size,
    };

faulty.c

    typedef struct faulty_conf {
        int period[Modes];
        atomic_t counters[Modes];
        sector_t faults[MaxFault];
        int modes[MaxFault];
        int nfaults;
        mdk_rdev_t *rdev;
    } conf_t;

    static int run(mddev_t *mddev) {
        mdk_rdev_t *rdev;
        conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL);
        ... /* error handling + zero out conf */
        /* same_set is the list field in mdk_rdev_t */
        list_for_each_entry(rdev, &mddev->disks, same_set)
            conf->rdev = rdev;
        md_set_array_sectors(mddev, mddev->dev_sectors);
        mddev->private = conf;
        reshape(mddev);
        return 0;
    }

faulty.c

    static int reshape(mddev_t *mddev) {
        int mode = mddev->new_layout & ModeMask;
        int count = mddev->new_layout >> ModeShift;
        conf_t *conf = mddev->private;
        ... /* error checks */
        if (mode == /* clear something */) {
            /* clear various counters */
        } else if (mode < Modes) {
            /* Modes is the total number of failure modes
               (e.g., the transient write failure mode) */
            conf->period[mode] = count;
            if (!count) count++;
            atomic_set(&conf->counters[mode], count);
        } else ...
        return 0;
    }
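
The new_layout word packs both values: the failure mode in the low bits, the trigger period in the high bits. A sketch of the corresponding encoder; the constants mirror the decode above (in the real faulty.c, ModeShift is 5), but treat the details as illustrative.

    #define ModeShift 5
    #define ModeMask  ((1 << ModeShift) - 1)

    /* Compose a layout word that reshape() above can decode:
     * mode  = layout & ModeMask
     * count = layout >> ModeShift */
    static int make_layout(int mode, int count)
    {
        return (count << ModeShift) | mode;
    }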

faulty.c

    static int stop(mddev_t *mddev) {
        conf_t *conf = (conf_t *)mddev->private;
        kfree(conf);
        mddev->private = NULL;
        return 0;
    }

faulty.c

    static int make_request(request_queue_t *q, struct bio *bio) {
        mddev_t *mddev = q->queuedata;
        conf_t *conf = (conf_t *)mddev->private;
        int failit = 0;
        if (bio_data_dir(bio) == WRITE) { /* data direction */
            ... /* misc cases */
            /* if a sector failed before, it needs to stay failed */
            if (check_sector(conf, bio->bi_sector,
                             bio->bi_sector + (bio->bi_size >> 9), WRITE))
                failit = 1;
            /* if the period (some predefined constant) is reached
               for a sector, record the sector and fail it */
            if (check_mode(conf, WritePersistent)) {
                add_sector(conf, bio->bi_sector, WritePersistent);
                failit = 1;
            }
        } else { /* failure cases for reads */
            ...
        }
        if (failit) {
            struct bio *b = bio_clone(bio, GFP_NOIO);
            b->bi_bdev = conf->rdev->bdev;
            b->bi_private = bio;
            b->bi_end_io = faulty_fail;
            /* submit to the queue of this device, initialized in
               md.c from the disk device inode */
            generic_make_request(b);
            return 0;
        } else {
            /* make bio point to the actual device, and let the main
               block layer submit the I/O and resolve the recursion */
            bio->bi_bdev = conf->rdev->bdev;
            return 1;
        }
    }

faulty.c

    static void faulty_fail(struct bio *bio, int error) {
        struct bio *b = bio->bi_private;   /* the original bio */
        b->bi_size = bio->bi_size;
        b->bi_sector = bio->bi_sector;
        bio_put(bio);      /* release the clone */
        bio_io_error(b);   /* complete the original bio with an error */
    }

blk-core.c

- A file system eventually calls __generic_make_request()

    static inline void __generic_make_request(struct bio *bio) {
        ...
        do {
            ...
            q = bdev_get_queue(bio->bi_bdev);
            ... /* check errors */
            ret = q->make_request_fn(q, bio);
        } while (ret);
    }

- A nonzero return from make_request_fn means the bio has been remapped to another device; the loop then looks up the new device's queue and resubmits, which is how the md personalities above avoid unbounded recursion

linear.c

    static int __init linear_init(void) {
        return register_md_personality(&linear_personality);
    }

    static void linear_exit(void) {
        unregister_md_personality(&linear_personality);
    }

    module_init(linear_init);
    module_exit(linear_exit);

linear.c

    static struct mdk_personality linear_personality = {
        .name         = "linear",
        .level        = LEVEL_LINEAR,
        .owner        = THIS_MODULE,
        .make_request = linear_make_request,
        .run          = linear_run,
        .stop         = linear_stop,
        .status       = linear_status, /* for /proc */
        .hot_add_disk = linear_add,
        .size         = linear_size,
    };

linear.c

    typedef struct linear_private_data {
        sector_t array_sectors;
        dev_info_t disks[0];
        struct rcu_head rcu;
    } linear_conf_t;

    static int linear_run(mddev_t *mddev) {
        linear_conf_t *conf;
        /* linear_conf() also initializes conf->disks[i].end_sector */
        conf = linear_conf(mddev, mddev->raid_disks);
        if (!conf) return 1;
        mddev->private = conf;
        md_set_array_sectors(mddev, conf->array_sectors);
        ...
        /* determines whether two bios can be merged;
           overrides the default merge_bvec function */
        blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
        /* queues are first plugged to build up the queue length,
           then unplugged to release requests to devices */
        mddev->queue->unplug_fn = linear_unplug;
        /* disable prefetching when the device is congested */
        mddev->queue->backing_dev_info.congested_fn = linear_congested;
        mddev->queue->backing_dev_info.congested_data = mddev;
        md_integrity_register(mddev);
        return 0;
    }

linear.c

    static int linear_stop(mddev_t *mddev) {
        linear_conf_t *conf = mddev->private;
        rcu_barrier();  /* the unplug fn references 'conf' */
        blk_sync_queue(mddev->queue);
        kfree(conf);
        return 0;
    }

linear.c

    static int linear_make_request(request_queue_t *q, struct bio *bio) {
        const int rw = bio_data_dir(bio);
        mddev_t *mddev = q->queuedata;
        dev_info_t *tmp_dev;
        sector_t start_sector;
        ... /* check for errors and update statistics */
        rcu_read_lock();
        tmp_dev = which_dev(mddev, bio->bi_sector);
        start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
        ... /* more error checks */
        if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
                     tmp_dev->end_sector)) {
            /* This bio crosses a device boundary, so we have to
             * split it. */
            struct bio_pair *bp;
            sector_t end_sector = tmp_dev->end_sector;
            rcu_read_unlock();
            bp = bio_split(bio, end_sector - bio->bi_sector);
            if (linear_make_request(q, &bp->bio1)) /* recursion!?# */
                generic_make_request(&bp->bio1);
            if (linear_make_request(q, &bp->bio2)) /* recursion#!% */
                generic_make_request(&bp->bio2);
            bio_pair_release(bp); /* remove bio hazard */
            return 0;
        }
        /* point bio to the specific device instead of the linear
           device, translating the virtual sector number to the
           physical sector number for that device */
        bio->bi_bdev = tmp_dev->rdev->bdev;
        bio->bi_sector = bio->bi_sector - start_sector +
                         tmp_dev->rdev->data_offset;
        rcu_read_unlock();
        /* again, return 1 to let the main block layer submit the
           I/O and resolve the recursion */
        return 1;
    }
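
A worked example of the translation, with hypothetical sizes: if the first member disk holds 1000 sectors and the second holds 2000, which_dev() reports end_sector = 1000 and 3000 respectively. A request for virtual sector 1500 lands on the second disk, so start_sector = 3000 − 2000 = 1000, and the bio is remapped to physical sector 1500 − 1000 + data_offset = 500 + data_offset on that disk.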