Index: linux-2.6.17-rc4/drivers/md/dm-userspace.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.17-rc4/drivers/md/dm-userspace.c	2006-06-15 21:59:04.000000000 +0100
@@ -0,0 +1,1612 @@
+/*
+ * Copyright (C) International Business Machines Corp., 2006
+ * Author: Dan Smith
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/types.h>
+#include <linux/poll.h>
+
+#include <linux/dm-userspace.h>
+
+#include "dm.h"
+#include "dm-bio-list.h"
+#include "kcopyd.h"
+
+#define DMU_DEBUG 0
+
+#define DMU_COPY_PAGES 256
+#define DMU_KEY_LEN 256
+
+#define DMU_PREFIX "dm-userspace: "
+#define DMU_SET_ERROR(ti, msg) ti->error = DMU_PREFIX msg
+
+#if DMU_DEBUG
+#define DPRINTK( s, arg... ) printk(DMU_PREFIX s, ##arg)
+#else
+#define DPRINTK( s, arg... )
+#endif
+
+kmem_cache_t *request_cache;
+kmem_cache_t *remap_cache;
+
+static int enable_watchdog = 0;
+static struct work_struct wd;
+
+static spinlock_t devices_lock;
+static LIST_HEAD(devices);
+
+/* Device number for the control device */
+static dev_t our_dev;
+
+struct target_device {
+        struct list_head list;
+        struct block_device *bdev;
+        struct kref users;
+};
+
+struct hash_table {
+        struct list_head *table;
+        uint64_t size;
+        uint32_t mask;
+        uint64_t count;
+};
+
+/* A dm-userspace device, which consists of multiple targets sharing a
+ * common key
+ */
+struct dmu_device {
+        spinlock_t lock;
+        struct list_head list;
+        struct list_head requests;
+        struct list_head target_devs;
+
+        struct hash_table remaps;
+
+        struct cdev cdev;
+        dev_t ctl_dev;
+
+        char key[DMU_KEY_LEN];
+        struct kref users;
+
+        wait_queue_head_t wqueue;
+
+        uint64_t block_size;
+        uint64_t block_mask;
+        unsigned int block_shift;
+
+        struct kcopyd_client *kcopyd_client;
+
+        /*
+         * Count of the number of outstanding requests that have been
+         * made against this device, but have not yet been flushed
+         */
+        atomic_t remap_ct;
+
+        uint32_t id_counter;
+};
+
+struct userspace_request {
+        spinlock_t lock;
+        struct list_head list;
+        struct dmu_device *dev;
+        int type;
+        int sent;
+        uint32_t flags;
+        uint32_t id;
+        union {
+                struct bio_list bios;
+                uint64_t block;
+        } u;
+        atomic_t refcnt;
+};
+
+struct dmu_map {
+        spinlock_t lock;
+        uint64_t org_block;     /* Original block */
+        uint64_t new_block;     /* Destination block */
+        int64_t offset;
+        uint32_t flags;
+        struct target_device *src;
+        struct target_device *dest;
+        struct bio_list bios;
+        struct list_head list;
+        struct dmu_device *dev;
+
+        struct dmu_map *next;   /* Next remap that is dependent on this one */
+};
+
+/* Forward declarations */
+static struct file_operations ctl_fops;
+static void copy_block(struct dmu_map *remap);
+static void remap_flusher(struct dmu_map *remap);
+
+/*
+ * Return the block number for @sector
+ */
+static inline u64 dmu_block(struct 
dmu_device *dev, + sector_t sector) +{ + return sector >> dev->block_shift; +} + +/* + * Return the sector offset in a block for @sector + */ +static inline u64 dmu_sector_offset(struct dmu_device *dev, + sector_t sector) +{ + return sector & dev->block_mask; +} + +/* + * Return the starting sector for @block + */ +static inline u64 dmu_sector(struct dmu_device *dev, + uint64_t block) +{ + return block << dev->block_shift; +} + +static void copy_or_flush(struct dmu_map *remap) +{ + int copy; + + spin_lock(&remap->lock); + copy = dmu_get_flag(&remap->flags, DMU_FLAG_COPY_FIRST); + spin_unlock(&remap->lock); + + if (copy) + copy_block(remap); + else + remap_flusher(remap); +} + +static void error_bios(struct bio_list *bios) +{ + struct bio *bio; + int count = 0; + + while ((bio = bio_list_pop(bios)) != NULL) { + bio_io_error(bio, bio->bi_size); + count++; + } + + if (count) + printk(KERN_ERR DMU_PREFIX + "*** Failed %i requests\n", count); +} + +static void init_remap(struct dmu_device *dev, struct dmu_map *remap) +{ + spin_lock_init(&remap->lock); + remap->org_block = remap->new_block = 0; + remap->offset = 0; + remap->flags = 0; + remap->src = remap->dest = NULL; + bio_list_init(&remap->bios); + INIT_LIST_HEAD(&remap->list); + remap->dev = dev; + remap->next = NULL; +} + +static void init_request(struct dmu_device *dev, + int type, + struct userspace_request *req) +{ + spin_lock_init(&req->lock); + INIT_LIST_HEAD(&req->list); + req->dev = dev; + req->type = type; + req->sent = 0; + req->flags = 0; + if (type == DM_USERSPACE_COPY_FINISHED) { + req->u.block = 0; + req->id = 0; + } else { + bio_list_init(&req->u.bios); + spin_lock(&dev->lock); + dev->id_counter++; + if (dev->id_counter == 0) + dev->id_counter = 1; + req->id = dev->id_counter; + spin_unlock(&dev->lock); + } + atomic_set(&req->refcnt, 0); +} + +static void destroy_remap(struct dmu_map *remap) +{ + error_bios(&remap->bios); +} + +/* + * For an even block distribution, this is not too bad, but it could + * probably be better + */ +static uint32_t ht_hash(struct hash_table *ht, uint64_t block) +{ + return (uint32_t)block & ht->mask; +} + +static int ht_init(struct hash_table *ht, unsigned long size) +{ + uint64_t i; + unsigned long pages; + unsigned int order = ffs((size * sizeof(struct list_head *)) / + PAGE_SIZE); + + DPRINTK("Going to allocate 2^%u pages for %lu-entry table\n", + order, size); + + pages = __get_free_pages(GFP_ATOMIC, order); + if (!pages) { + DPRINTK("Failed to allocate hash table (%lu)\n", size); + return 0; + } + + ht->table = (void *)pages; + ht->size = size; + ht->count = 0; + ht->mask = size - 1; + + for (i = 0; i < size; i++) + INIT_LIST_HEAD(&ht->table[i]); + + return 1; +} + +static void ht_insert_bucket(struct dmu_map *map, struct list_head *list) +{ + list_add_tail(&map->list, list); +} + +/* + * I'm sure this is quite dumb, but it works for now + */ +static int ht_should_grow(struct hash_table *ht) +{ + return ht->count > (2 * (ht->size / 4)); +} + +static void ht_grow_table(struct hash_table *ht); +static void ht_insert_map(struct hash_table *ht, struct dmu_map *map) +{ + uint32_t addr; + + addr = ht_hash(ht, map->org_block) & ht->mask; + + BUG_ON(addr >= ht->size); + + ht_insert_bucket(map, &ht->table[addr]); + ht->count++; + + if (ht_should_grow(ht)) + ht_grow_table(ht); +} + +static void ht_delete_map(struct hash_table *ht, struct dmu_map *map) +{ + list_del(&map->list); + BUG_ON(ht->count == 0); + ht->count--; +} + +static struct dmu_map *ht_find_map(struct hash_table *ht, uint64_t block) +{ + 
uint32_t addr; + struct dmu_map *m; + + addr = ht_hash(ht, block) & ht->mask; + + BUG_ON(addr >= ht->size); + + list_for_each_entry(m, &ht->table[addr], list) { + if (m->org_block == block) + return m; + } + + return NULL; +} + +static struct dmu_map *ht_find_map_dev(struct dmu_device *dev, uint64_t block) +{ + struct dmu_map *remap; + + spin_lock(&dev->lock); + + remap = ht_find_map(&dev->remaps, block); + + spin_unlock(&dev->lock); + + return remap; +} + +static void ht_grow_table(struct hash_table *ht) +{ + struct hash_table old_table; + uint64_t i; + + old_table = *ht; + + if (!ht_init(ht, old_table.size * 2)) { + DPRINTK("Can't grow table to %llu\n", + old_table.size * 2); + return; + } + + DPRINTK("Growing from %llu to %llu\n", + old_table.size, ht->size); + + for (i = 0; i < old_table.size; i++ ) { + struct dmu_map *m, *n; + list_for_each_entry_safe(m, n, &old_table.table[i], + list) { + list_del_init(&m->list); + ht_insert_map(ht, m); + } + } + + free_pages((unsigned long)old_table.table, + ffs((old_table.size * sizeof(struct list_head *)) + / PAGE_SIZE)); +} + +static uint64_t ht_destroy_table(struct hash_table *ht) +{ + uint64_t i, count = 0; + struct dmu_map *m, *n; + + for (i = 0; i < ht->size; i++) { + list_for_each_entry_safe(m, n, &ht->table[i], list) { + ht_delete_map(ht, m); + kmem_cache_free(remap_cache, m); + count++; + } + } + + return count; +} + +static struct target_device *get_target(struct dmu_device *dev, + dev_t devno) +{ + + struct target_device *target; + struct block_device *bdev; + + spin_lock(&dev->lock); + list_for_each_entry(target, &dev->target_devs, list) { + if (target->bdev->bd_dev == devno) { + spin_unlock(&dev->lock); + goto out; + } + } + spin_unlock(&dev->lock); + + bdev = open_by_devnum(devno, FMODE_READ | FMODE_WRITE); + if (IS_ERR(bdev)) { + printk(KERN_ERR DMU_PREFIX "Unable to lookup device %x\n", + devno); + return NULL; + } + + target = kmalloc(sizeof(*target), GFP_KERNEL); + if (!target) { + printk(KERN_ERR DMU_PREFIX + "Unable to alloc new target device\n"); + return NULL; + } + + target->bdev = bdev; + INIT_LIST_HEAD(&target->list); + + spin_lock(&dev->lock); + list_add_tail(&target->list, &dev->target_devs); + spin_unlock(&dev->lock); + + out: + return target; +} + +/* Caller must hold dev->lock */ +static void put_target(struct dmu_device *dev, + struct target_device *target) +{ + list_del(&target->list); + + bd_release(target->bdev); + blkdev_put(target->bdev); + + kfree(target); +} + +/* + * Add a request to the device's request queue + */ +static void add_request(struct dmu_device *dev, + struct userspace_request *req) +{ + spin_lock(&dev->lock); + list_add_tail(&req->list, &dev->requests); + spin_unlock(&dev->lock); + + wake_up(&dev->wqueue); +} + +/* + * + */ +static int have_pending_requests(struct dmu_device *dev) +{ + struct userspace_request *req; + int ret = 0; + + spin_lock(&dev->lock); + + list_for_each_entry(req, &dev->requests, list) { + if (!req->sent) { + ret = 1; + break; + } + } + + spin_unlock(&dev->lock); + + return ret; +} + +/* + * This periodically dumps out some debug information. It's really + * only useful while developing. 
+ */ +static void watchdog(void *data) +{ + unsigned int v_remaps, i_remaps, reqs, s_reqs, devs = 0; + struct dmu_device *dev; + struct dmu_map *map; + struct userspace_request *req; + uint64_t i; + + spin_lock(&devices_lock); + + list_for_each_entry(dev, &devices, list) { + spin_lock(&dev->lock); + + v_remaps = i_remaps = reqs = s_reqs = 0; + + for (i = 0; i < dev->remaps.size; i++) { + list_for_each_entry(map, &dev->remaps.table[i], list) + if (dmu_get_flag(&map->flags, DMU_FLAG_VALID)) + v_remaps++; + else + i_remaps++; + } + + list_for_each_entry(req, &dev->requests, list) + if (req->sent) + s_reqs++; + else + reqs++; + + printk("Device %x:%x: " + " reqs: %u/%u " + " inv maps: %u " + " val maps: %u (%i)\n", + MAJOR(dev->ctl_dev), MINOR(dev->ctl_dev), + reqs, s_reqs, i_remaps, v_remaps, + atomic_read(&dev->remap_ct)); + devs++; + + spin_unlock(&dev->lock); + } + + spin_unlock(&devices_lock); + + schedule_delayed_work(&wd, HZ); +} + +static void __bio_remap(struct bio *bio, + struct dmu_map *remap) +{ + BUG_ON(remap->dest == NULL); + + bio->bi_sector = dmu_sector(remap->dev, remap->new_block) + + dmu_sector_offset(remap->dev, bio->bi_sector) + + remap->offset; + + bio->bi_bdev = remap->dest->bdev; +} + +/* + Pop, remap, and flush a bio. Set VALID flag if no bios + available +*/ +static struct bio *pop_and_remap(struct dmu_map *remap) +{ + struct bio *bio = NULL; + + spin_lock(&remap->lock); + + bio = bio_list_pop(&remap->bios); + if (bio) + __bio_remap(bio, remap); + else { + /* If there are no more bios, we must set the VALID + flag before we release the lock */ + dmu_set_flag(&remap->flags, DMU_FLAG_VALID); + } + + spin_unlock(&remap->lock); + + return bio; +} + +static void get_remap_attrs(struct dmu_map *remap, + int *copy_first, + int *temporary, + struct dmu_map **next) +{ + spin_lock(&remap->lock); + + *copy_first = dmu_get_flag(&remap->flags, DMU_FLAG_COPY_FIRST); + *temporary = dmu_get_flag(&remap->flags, DMU_FLAG_TEMPORARY); + *next = remap->next; + remap->next = NULL; + + spin_unlock(&remap->lock); +} + +static void remap_flusher(struct dmu_map *remap) +{ + struct bio *bio; + struct userspace_request *req; + int copy_first = 0, temporary = 0; + struct dmu_map *next; + + DPRINTK("Flushing bios for block %llu:%llu\n", + remap->org_block, remap->new_block); + + while (1) { + + bio = pop_and_remap(remap); + + if (bio) + generic_make_request(bio); + else + break; + + atomic_dec(&remap->dev->remap_ct); + + DPRINTK("Flushed %llu:%llu (%u bytes)\n", + dmu_block(remap->dev, bio->bi_sector), + dmu_sector_offset(remap->dev, bio->bi_sector), + bio->bi_size); + } + + get_remap_attrs(remap, ©_first, &temporary, &next); + + if (next) + copy_or_flush(next); + + /* Notify userspace */ + if (copy_first) { + req = kmem_cache_alloc(request_cache, GFP_KERNEL); + if (!req) { + printk(KERN_ERR DMU_PREFIX + "Failed to allocate copy response\n"); + return; + } + + init_request(remap->dev, DM_USERSPACE_COPY_FINISHED, req); + + req->u.block = remap->org_block; + + add_request(remap->dev, req); + } + + if (temporary) { + destroy_remap(remap); + kmem_cache_free(remap_cache, remap); + } +} + +static void destroy_dmu_device(struct kref *ref) +{ + struct dmu_device *dev; + struct list_head *cursor, *next; + uint64_t remaps; + + dev = container_of(ref, struct dmu_device, users); + + DPRINTK("Destroying device: %s\n", dev->key); + + spin_lock(&devices_lock); + list_del(&dev->list); + spin_unlock(&devices_lock); + + list_for_each_safe(cursor, next, &dev->target_devs) { + struct target_device *target; + + target 
= list_entry(cursor, + struct target_device, + list); + + put_target(dev, target); + } + + remaps = ht_destroy_table(&dev->remaps); + DPRINTK("Destroyed %llu/%llu remaps\n", remaps, dev->remaps.count); + + list_for_each_safe(cursor, next, &dev->requests) { + struct userspace_request *req; + + req = list_entry(cursor, + struct userspace_request, + list); + + list_del(&req->list); + + error_bios(&req->u.bios); + + kmem_cache_free(request_cache, req); + } + + kcopyd_client_destroy(dev->kcopyd_client); + + cdev_del(&dev->cdev); + kfree(dev); +} + +static inline void get_dev(struct dmu_device *dev) +{ + DPRINTK("get on %s\n", dev->key); + kref_get(&dev->users); +} + +static inline void put_dev(struct dmu_device *dev) +{ + DPRINTK("put on %s\n", dev->key); + kref_put(&dev->users, destroy_dmu_device); +} + +static int get_free_minor(void) +{ + struct dmu_device *dev; + int minor = 0; + + spin_lock(&devices_lock); + + list_for_each_entry(dev, &devices, list) { + if (MINOR(dev->ctl_dev) != minor) + break; + minor++; + } + + spin_unlock(&devices_lock); + + return minor; +} + +static int init_dmu_device(struct dmu_device *dev, u32 block_size) +{ + int ret; + + cdev_init(&dev->cdev, &ctl_fops); + dev->cdev.owner = THIS_MODULE; + dev->cdev.ops = &ctl_fops; + + init_waitqueue_head(&dev->wqueue); + INIT_LIST_HEAD(&dev->list); + INIT_LIST_HEAD(&dev->requests); + INIT_LIST_HEAD(&dev->target_devs); + kref_init(&dev->users); + spin_lock_init(&dev->lock); + + atomic_set(&dev->remap_ct, 0); + dev->id_counter = 1; /* reserve 0 for unsolicited maps */ + + if (!ht_init(&dev->remaps, 2048)) { + printk(KERN_ERR DMU_PREFIX + "Unable to allocate hash table\n"); + return 0; + } + + dev->block_size = block_size; + dev->block_mask = block_size - 1; + dev->block_shift = ffs(block_size) - 1; + + ret = kcopyd_client_create(DMU_COPY_PAGES, &dev->kcopyd_client); + if (ret) { + printk(DMU_PREFIX "Failed to initialize kcopyd client\n"); + return 0; + } + + return 1; +} + +static struct dmu_device *new_dmu_device(char *key, + struct dm_target *ti, + u32 block_size) +{ + struct dmu_device *dev, *ptr; + int ret; + + dev = kmalloc(sizeof(*dev), GFP_KERNEL); + if (dev == NULL) { + printk(DMU_PREFIX "Failed to allocate new userspace device\n"); + return NULL; + } + + if (!init_dmu_device(dev, block_size)) + goto bad1; + + snprintf(dev->key, DMU_KEY_LEN, "%s", key); + + DPRINTK("New device with size %llu mask 0x%llX shift %u\n", + dev->block_size, dev->block_mask, dev->block_shift); + + dev->ctl_dev = MKDEV(MAJOR(our_dev), get_free_minor()); + + ret = cdev_add(&dev->cdev, dev->ctl_dev, 1); + if (ret < 0) { + printk(DMU_PREFIX "Failed to register control device %d:%d\n", + MAJOR(dev->ctl_dev), MINOR(dev->ctl_dev)); + goto bad2; + } + + DPRINTK("Registered new control interface: %i:%i\n", + MAJOR(dev->ctl_dev), MINOR(dev->ctl_dev)); + + spin_lock(&devices_lock); + if (list_empty(&devices)) + list_add(&dev->list, &devices); + else + list_for_each_entry(ptr, &devices, list) + if (MINOR(ptr->ctl_dev) < MINOR(dev->ctl_dev)) + list_add(&dev->list, &ptr->list); + spin_unlock(&devices_lock); + + return dev; + + bad2: + cdev_del(&dev->cdev); + bad1: + kfree(dev); + printk(KERN_ERR DMU_PREFIX "Failed to create device\n"); + return NULL; +} + +static struct dmu_device *find_dmu_device(const char *key) +{ + struct dmu_device *dev; + struct dmu_device *match = NULL; + + spin_lock(&devices_lock); + + list_for_each_entry(dev, &devices, list) { + spin_lock(&dev->lock); + if (strncmp(dev->key, key, DMU_KEY_LEN) == 0) { + match = dev; + 
spin_unlock(&dev->lock); + break; + } + spin_unlock(&dev->lock); + } + + spin_unlock(&devices_lock); + + return match; +} + +static int dmu_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + uint64_t block_size; + struct dmu_device *dev; + char *device_key; + char *block_size_param; + + if (argc < 2) { + DMU_SET_ERROR(ti, "Invalid argument count"); + return -EINVAL; + } + + device_key = argv[0]; + block_size_param = argv[1]; + + block_size = simple_strtoul(block_size_param, NULL, 10) / 512; + + dev = find_dmu_device(device_key); + if (dev == NULL) { + dev = new_dmu_device(device_key, + ti, + block_size); + if (dev == NULL) { + DMU_SET_ERROR(ti, "Failed to create device"); + goto bad; + } + } else { + get_dev(dev); + } + + spin_lock(&dev->lock); + if (dev->block_size != block_size) { + DMU_SET_ERROR(ti, "Invalid block size"); + goto bad; + } + spin_unlock(&dev->lock); + + ti->private = dev; + ti->split_io = block_size; + + DPRINTK(" block-size: %llu sectors\n", dev->block_size); + DPRINTK(" block-shift: %u\n", dev->block_shift); + DPRINTK(" block-mask: %llx\n", dev->block_mask); + + return 0; + + bad: + if (dev) { + spin_unlock(&dev->lock); + put_dev(dev); + } + + return -EINVAL; +} + +static void dmu_dtr(struct dm_target *ti) +{ + struct dmu_device *dev = (struct dmu_device *) ti->private; + + put_dev(dev); + + DPRINTK("destroyed %d:%d\n", (int)ti->begin, (int)ti->len); +} + +/* Search @dev for an outstanding request for remapping @block */ +static struct userspace_request *find_existing_req(struct dmu_device *dev, + uint64_t block) +{ + struct userspace_request *req; + struct userspace_request *maybe = NULL; + + spin_lock(&dev->lock); + + list_for_each_entry(req, &dev->requests, list) { + if ((req->type == DM_USERSPACE_MAP_BLOCK) && + (dmu_block(dev, req->u.bios.head->bi_sector) == block)) { + if (maybe) { + atomic_dec(&maybe->refcnt); + } + maybe = req; + atomic_inc(&maybe->refcnt); + } + } + + spin_unlock(&dev->lock); + + return maybe; +} + +static int make_new_request(struct dmu_device *dev, struct bio *bio) +{ + struct userspace_request *req; + + req = kmem_cache_alloc(request_cache, GFP_KERNEL); + if (req == NULL) + goto bad; + + init_request(dev, DM_USERSPACE_MAP_BLOCK, req); + + dmu_set_flag(&req->flags, DMU_FLAG_RD); + if (bio_rw(bio)) + dmu_set_flag(&req->flags, DMU_FLAG_WR); + else + dmu_clr_flag(&req->flags, DMU_FLAG_WR); + bio_list_add(&req->u.bios, bio); + + add_request(dev, req); + + DPRINTK("Queued %s request for sector " SECTOR_FORMAT "\n", + dmu_get_flag(&req->flags, DMU_FLAG_WR) ? 
"write" : "read", + bio->bi_sector); + + return 0; + + bad: + printk(KERN_ERR DMU_PREFIX "Failed to queue bio!\n"); + return -1; +} + +static int dmu_map_remap_case(struct dmu_device *dev, + struct dmu_map *remap, + struct bio *bio) +{ + int ret = 0; + + spin_lock(&remap->lock); + + if (dmu_get_flag(&remap->flags, DMU_FLAG_WR) != bio_rw(bio)) { + ret = -1; + } else { + if (dmu_get_flag(&remap->flags, DMU_FLAG_VALID)) { + __bio_remap(bio, remap); + ret = 1; + atomic_dec(&dev->remap_ct); + } else { + bio_list_add(&remap->bios, bio); + } + } + + spin_unlock(&remap->lock); + + return ret; +} + +static int dmu_map_request_case(struct dmu_device *dev, + struct userspace_request *req, + struct bio *bio) +{ + int ret = 0; + int req_rw = dmu_get_flag(&req->flags, DMU_FLAG_WR); + + spin_lock(&req->lock); + + if (!req_rw && bio_rw(bio) && !req->sent) { + /* Convert to R/W and Queue */ + dmu_set_flag(&req->flags, DMU_FLAG_WR); + bio_list_add(&req->u.bios, bio); + } else if (!req_rw && bio_rw(bio) && req->sent) { + /* Can't convert, must re-request */ + ret = -1; + } else { + /* Queue */ + bio_list_add(&req->u.bios, bio); + } + + spin_unlock(&req->lock); + + return ret; +} + +static int dmu_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct dmu_device *dev = (struct dmu_device *) ti->private; + struct dmu_map *remap; + struct userspace_request *req; + int ret = 0; + u64 block; + + atomic_inc(&dev->remap_ct); + + block = dmu_block(dev, bio->bi_sector); + + remap = ht_find_map_dev(dev, block); + if (remap) { + ret = dmu_map_remap_case(dev, remap, bio); + if (ret >= 0) + goto done; + } + + req = find_existing_req(dev, block); + if (req) { + ret = dmu_map_request_case(dev, req, bio); + atomic_dec(&req->refcnt); + if (ret >= 0) + goto done; + } + + ret = make_new_request(dev, bio); + + done: + return ret; +} + +static int dmu_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + struct dmu_device *dev = (struct dmu_device *) ti->private; + + switch (type) { + case STATUSTYPE_INFO: + snprintf(result, maxlen, "%x:%x\n", + MAJOR(dev->ctl_dev), + MINOR(dev->ctl_dev)); + break; + + case STATUSTYPE_TABLE: + snprintf(result, maxlen, "%s %llu", + dev->key, + dev->block_size * 512); + break; + } + + return 0; +} + +static struct target_type userspace_target = { + .name = "userspace", + .version = {0, 1, 0}, + .module = THIS_MODULE, + .ctr = dmu_ctr, + .dtr = dmu_dtr, + .map = dmu_map, + .status = dmu_status, +}; + +static int format_userspace_message(struct dmu_write *msg, + struct userspace_request *req) +{ + int ret = 1; + + spin_lock(&req->lock); + + if (req->sent) + goto out; + + msg->id = req->id; + msg->type = req->type; + dmu_cpy_flag(&msg->flags, req->flags, DMU_FLAG_RD); + dmu_cpy_flag(&msg->flags, req->flags, DMU_FLAG_WR); + + switch (msg->type) { + case DM_USERSPACE_MAP_BLOCK: + msg->org_block = dmu_block(req->dev, + req->u.bios.head->bi_sector); + DPRINTK("Asking userspace to map %llu (%c)\n", + msg->org_block, + dmu_get_flag(&msg->flags, DMU_FLAG_WR) ? 
'W' : 'R'); + break; + + case DM_USERSPACE_COPY_FINISHED: + case DM_USERSPACE_INVAL_COMPLETE: + case DM_USERSPACE_INVAL_FAILED: + msg->org_block = req->u.block; + break; + + default: + printk(KERN_INFO DMU_PREFIX + "Unknown message type %i\n", msg->type); + ret = 0; + } + + req->sent = 1; + + out: + spin_unlock(&req->lock); + + if (msg->type != DM_USERSPACE_MAP_BLOCK) { + /* COPY_FINISHED, et al messages don't get responses, + * so we take them off the request queue here + */ + list_del(&req->list); + kmem_cache_free(request_cache, req); + } + + return ret; +} + +ssize_t dmu_ctl_read(struct file *file, char __user *buffer, + size_t size, loff_t *offset) +{ + + struct dmu_device *dev = (struct dmu_device *)file->private_data; + struct dmu_write *msg; + struct userspace_request *req = NULL; + struct userspace_request *next; + int ret = 0; + int num_reqs, req_idx = 0; + + num_reqs = size / sizeof(*msg); + + if (num_reqs == 0) + return -EINVAL; + + msg = kmalloc(num_reqs * sizeof(*msg), GFP_KERNEL); + if (!msg) { + printk(KERN_ERR DMU_PREFIX + "Failed to alloc %i reqs!\n", num_reqs); + return -ENOMEM; + } + + while (!have_pending_requests(dev)) { + if (file->f_flags & O_NONBLOCK) { + return 0; + } + + if (wait_event_interruptible(dev->wqueue, + have_pending_requests(dev))) + return -ERESTARTSYS; + } + + spin_lock(&dev->lock); + + list_for_each_entry_safe(req, next, &dev->requests, list) { + + if (!format_userspace_message(&msg[req_idx], req)) + continue; + + ret += sizeof(*msg); + if (++req_idx >= num_reqs) { + break; + } + } + + spin_unlock(&dev->lock); + + if (copy_to_user(buffer, msg, sizeof(*msg) * req_idx)) { + DPRINTK("control read copy_to_user failed!\n"); + ret = -EFAULT; + } + + kfree(msg); + + return ret; +} + +static void copy_callback(int read_err, + unsigned int write_err, + void *data) +{ + remap_flusher((struct dmu_map *)data); +} + +static void copy_block(struct dmu_map *remap) +{ + struct io_region src, dst; + struct kcopyd_client *client; + + spin_lock(&remap->lock); + + src.bdev = remap->src->bdev; + src.sector = remap->org_block << remap->dev->block_shift; + src.count = remap->dev->block_size; + + dst.bdev = remap->dest->bdev; + dst.sector = (remap->new_block << remap->dev->block_shift); + dst.sector += remap->offset; + dst.count = remap->dev->block_size; + + DPRINTK("Copying: " + SECTOR_FORMAT "(" SECTOR_FORMAT "):" SECTOR_FORMAT " -> " + SECTOR_FORMAT "(" SECTOR_FORMAT "):" SECTOR_FORMAT "\n", + remap->org_block, + src.sector, + src.count * 512, + remap->new_block, + dst.sector, + dst.count * 512); + + client = remap->dev->kcopyd_client; + + spin_unlock(&remap->lock); + + kcopyd_copy(client, &src, 1, &dst, 0, copy_callback, remap); +} + +static int remap_request(struct dmu_write *msg, + struct dmu_device *dev, + struct userspace_request *req) + +{ + struct dmu_map *remap = NULL, *parent = NULL; + struct target_device *s_dev = NULL, *d_dev = NULL; + int is_chained = 0; + struct bio_list bio_holder; + + if (dmu_get_flag(&msg->flags, DMU_FLAG_COPY_FIRST)) { + s_dev = get_target(dev, MKDEV(msg->src_maj, msg->src_min)); + if (!s_dev) { + printk(KERN_ERR DMU_PREFIX + "Failed to find src device %i:%i\n", + msg->src_maj, msg->src_min); + goto bad; + } + } + + d_dev = get_target(dev, MKDEV(msg->dest_maj, msg->dest_min)); + if (!d_dev) { + printk(KERN_ERR DMU_PREFIX "Failed to find dst device %i:%i\n", + msg->dest_maj, msg->dest_min); + goto bad; + } + + if (req) { + while (atomic_read(&req->refcnt) != 0) { + DPRINTK("Waiting for exclusive use of request\n"); + schedule(); + } + + 
spin_lock(&req->lock); + bio_holder = req->u.bios; + spin_unlock(&req->lock); + } else { + bio_list_init(&bio_holder); + } + + /* Allocate a new remap early (before grabbing locks), since + we will most likely need it */ + remap = kmem_cache_alloc(remap_cache, GFP_KERNEL); + if (!remap) { + printk(KERN_ERR DMU_PREFIX "Failed to alloc remap!"); + goto bad; + } + init_remap(dev, remap); + spin_lock(&remap->lock); + remap->org_block = msg->org_block; + + spin_lock(&dev->lock); + + /* Here, we insert the new remap into the table, and remove + the existing map, if present, all in one locked operation */ + + parent = ht_find_map(&dev->remaps, msg->org_block); + if (parent) { + + spin_lock(&parent->lock); + + if (!dmu_get_flag(&parent->flags, DMU_FLAG_VALID)) { + if (dmu_get_flag(&parent->flags, DMU_FLAG_WR) == + dmu_get_flag(&msg->flags, DMU_FLAG_WR) && + (parent->new_block == msg->new_block)) { + /* Perms match for this not-yet-valid remap, + so tag our bios on to it and bail */ + bio_list_merge(&parent->bios, + &bio_holder); + + spin_unlock(&parent->lock); + spin_unlock(&dev->lock); + kmem_cache_free(remap_cache, remap); + return 1; + } else { + /* Remove parent from remap table, and + chain our new remap to this one so + it will fire when parent goes + valid */ + list_del(&parent->list); + if (parent->next) { + DPRINTK("Parent already chained!\n"); + BUG(); + } + parent->next = remap; + dmu_set_flag(&parent->flags, + DMU_FLAG_TEMPORARY); + is_chained = 1; + } + } else { + /* Remove existing valid remap */ + list_del(&parent->list); + destroy_remap(parent); + kmem_cache_free(remap_cache, parent); + } + + spin_unlock(&parent->lock); + } + + ht_insert_map(&dev->remaps, remap); + + spin_unlock(&dev->lock); + + remap->new_block = msg->new_block; + remap->offset = msg->offset; + remap->src = s_dev; + remap->dest = d_dev; + remap->dev = dev; + + dmu_clr_flag(&remap->flags, DMU_FLAG_VALID); + dmu_cpy_flag(&remap->flags, msg->flags, DMU_FLAG_TEMPORARY); + dmu_cpy_flag(&remap->flags, msg->flags, DMU_FLAG_WR); + dmu_cpy_flag(&remap->flags, msg->flags, DMU_FLAG_RD); + dmu_cpy_flag(&remap->flags, msg->flags, DMU_FLAG_COPY_FIRST); + + remap->bios = bio_holder; + + spin_unlock(&remap->lock); + + if (! 
is_chained) + copy_or_flush(remap); + + return 1; + + bad: + printk(KERN_ERR DMU_PREFIX "Remap error: chaos may ensue\n"); + + return 0; +} + +static int invalidate_request(struct dmu_write *msg, + struct dmu_device *dev) +{ + struct dmu_map *remap; + struct userspace_request *req; + int ret = 1; + + remap = ht_find_map_dev(dev, msg->org_block); + if (!remap) + ret = 0; + else { + spin_lock(&dev->lock); + spin_lock(&remap->lock); + if (dmu_get_flag(&remap->flags, DMU_FLAG_VALID)) + ht_delete_map(&dev->remaps, remap); + else + ret = 0; + spin_unlock(&remap->lock); + spin_unlock(&dev->lock); + } + + req = kmem_cache_alloc(request_cache, GFP_KERNEL); + if (!req) { + printk(KERN_ERR DMU_PREFIX + "Failed to allocate request\n"); + return 0; + } + + if (ret) { + DPRINTK("Invalidated mapping for: %llu\n", + msg->org_block); + init_request(dev, DM_USERSPACE_INVAL_COMPLETE, req); + } else { + DPRINTK("Failed to invalidate mapping for: %llu\n", + msg->org_block); + init_request(dev, DM_USERSPACE_INVAL_FAILED, req); + } + + req->u.block = msg->org_block; + + add_request(dev, req); + + return ret; +} + +ssize_t dmu_ctl_write(struct file *file, const char __user *buffer, + size_t size, loff_t *offset) +{ + + struct dmu_device *dev = (struct dmu_device *)file->private_data; + struct dmu_write msg; + struct userspace_request *next; + struct userspace_request *req = NULL, *match = NULL; + int num_resp, resp_idx; + int ret = 0; + + num_resp = size / sizeof(struct dmu_write); + + if (num_resp == 0) + return -EINVAL; + + for (resp_idx = 0; resp_idx < num_resp; resp_idx++) { + if (copy_from_user(&msg, buffer+ret, sizeof(msg))) { + printk(DMU_PREFIX + "control_write copy_from_user failed!\n"); + ret = -EFAULT; + goto out; + } + + ret += sizeof(msg); + + match = NULL; + /* See if we have a pending request that matches this */ + spin_lock(&dev->lock); + list_for_each_entry_safe(req, next, &dev->requests, list) { + if ((req->type == DM_USERSPACE_MAP_BLOCK) && + (req->id == msg.id)) { + list_del(&req->list); + match = req; + break; + } + } + spin_unlock(&dev->lock); + + if (!match) + DPRINTK("Processing unsolicited request: %u\n", + msg.id); + + switch (msg.type) { + + case DM_USERSPACE_MAP_BLOCK: + DPRINTK("Got map: %llu -> %llu:%lli (%i:%i) [%c]\n", + msg.org_block, + msg.new_block, + msg.offset, + msg.dest_maj, + msg.dest_min, + dmu_get_flag(&msg.flags, DMU_FLAG_WR)?'W':'R'); + remap_request(&msg, dev, match); + break; + + case DM_USERSPACE_MAP_FAILED: + if (match) { + printk(KERN_EMERG DMU_PREFIX + "userspace reported " + "failure to map sector %lu\n", + (unsigned long) + match->u.bios.head->bi_sector); + + spin_lock(&match->lock); + error_bios(&match->u.bios); + spin_unlock(&match->lock); + } + break; + + case DM_USERSPACE_MAP_INVALIDATE: + invalidate_request(&msg, dev); + break; + + default: + printk(KERN_ERR DMU_PREFIX + "Unknown request type: %i\n", msg.type); + } + + if (match) + kmem_cache_free(request_cache, match); + } + out: + return ret; +} + +int dmu_ctl_open(struct inode *inode, struct file *file) +{ + struct dmu_device *dev; + + dev = container_of(inode->i_cdev, struct dmu_device, cdev); + + get_dev(dev); + + file->private_data = dev; + + return 0; +} + +int dmu_ctl_release(struct inode *inode, struct file *file) +{ + struct dmu_device *dev; + + dev = (struct dmu_device *)file->private_data; + + put_dev(dev); + + return 0; +} + +unsigned dmu_ctl_poll(struct file *file, poll_table *wait) +{ + struct dmu_device *dev; + unsigned mask = 0; + + dev = (struct dmu_device *)file->private_data; + + 
poll_wait(file, &dev->wqueue, wait); + + if (have_pending_requests(dev)) + mask |= POLLIN | POLLRDNORM; + + return mask; +} + +static struct file_operations ctl_fops = { + .open = dmu_ctl_open, + .release = dmu_ctl_release, + .read = dmu_ctl_read, + .write = dmu_ctl_write, + .poll = dmu_ctl_poll, + .owner = THIS_MODULE, +}; + +int __init dm_userspace_init(void) +{ + int r = dm_register_target(&userspace_target); + if (r < 0) { + DMERR(DMU_PREFIX "Register failed %d", r); + return 0; + } + + spin_lock_init(&devices_lock); + + if (enable_watchdog) { + INIT_WORK(&wd, watchdog, NULL); + schedule_delayed_work(&wd, HZ); + } + + request_cache = + kmem_cache_create("dm-userspace-requests", + sizeof(struct userspace_request), + __alignof__ (struct userspace_request), + 0, NULL, NULL); + if (!request_cache) { + DMERR(DMU_PREFIX "Failed to allocate request cache\n"); + goto bad; + } + + remap_cache = + kmem_cache_create("dm-userspace-remaps", + sizeof(struct dmu_map), + __alignof__ (struct dmu_map), + 0, NULL, NULL); + if (!remap_cache) { + DMERR(DMU_PREFIX "Failed to allocate remap cache\n"); + goto bad2; + } + + r = alloc_chrdev_region(&our_dev, 0, 10, "dm-userspace"); + if (r) { + DMERR(DMU_PREFIX "Failed to allocate chardev region\n"); + goto bad3; + } + + DPRINTK(DMU_PREFIX "Loaded (major %i)\n", MAJOR(our_dev)); + + return 1; + + bad3: + kmem_cache_destroy(remap_cache); + bad2: + kmem_cache_destroy(request_cache); + bad: + dm_unregister_target(&userspace_target); + return 0; + +} + +void __exit dm_userspace_exit(void) +{ + int r; + struct list_head *cursor, *next; + struct dmu_device *dev; + + DPRINTK(DMU_PREFIX "Unloading\n"); + + if (enable_watchdog) + if (!cancel_delayed_work(&wd)) + flush_scheduled_work(); + + spin_lock(&devices_lock); + + list_for_each_safe(cursor, next, &devices) { + dev = list_entry(cursor, struct dmu_device, list); + list_del(cursor); + } + + spin_unlock(&devices_lock); + + unregister_chrdev_region(our_dev, 10); + + kmem_cache_destroy(request_cache); + kmem_cache_destroy(remap_cache); + + r = dm_unregister_target(&userspace_target); + if (r < 0) + DMERR(DMU_PREFIX "unregister failed %d", r); +} + +module_init(dm_userspace_init); +module_exit(dm_userspace_exit); + +module_param(enable_watchdog, int, S_IRUGO); + +MODULE_DESCRIPTION(DM_NAME " userspace target"); +MODULE_AUTHOR("Dan Smith"); +MODULE_LICENSE("GPL"); Index: linux-2.6.17-rc4/drivers/md/Kconfig =================================================================== --- linux-2.6.17-rc4.orig/drivers/md/Kconfig 2006-06-15 21:58:58.000000000 +0100 +++ linux-2.6.17-rc4/drivers/md/Kconfig 2006-06-15 21:59:04.000000000 +0100 @@ -237,6 +237,12 @@ config DM_SNAPSHOT ---help--- Allow volume managers to take writeable snapshots of a device. 
+config DM_USERSPACE
+       tristate "Userspace target (EXPERIMENTAL)"
+       depends on BLK_DEV_DM && EXPERIMENTAL
+       ---help---
+         A target that provides a userspace interface to device-mapper.
+
 config DM_MIRROR
        tristate "Mirror target (EXPERIMENTAL)"
        depends on BLK_DEV_DM && EXPERIMENTAL
Index: linux-2.6.17-rc4/drivers/md/Makefile
===================================================================
--- linux-2.6.17-rc4.orig/drivers/md/Makefile	2006-06-15 21:58:58.000000000 +0100
+++ linux-2.6.17-rc4/drivers/md/Makefile	2006-06-15 21:59:04.000000000 +0100
@@ -37,6 +37,7 @@ obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc
 obj-$(CONFIG_DM_SNAPSHOT)      += dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)        += dm-mirror.o
 obj-$(CONFIG_DM_ZERO)          += dm-zero.o
+obj-$(CONFIG_DM_USERSPACE)     += dm-userspace.o
 
 quiet_cmd_unroll = UNROLL $@
 cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
Index: linux-2.6.17-rc4/include/linux/dm-userspace.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.17-rc4/include/linux/dm-userspace.h	2006-06-15 21:59:04.000000000 +0100
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) International Business Machines Corp., 2006
+ * Author: Dan Smith
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef __DM_USERSPACE_H
+#define __DM_USERSPACE_H
+
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include <stdint.h>
+#endif
+
+/*
+ * Message Types
+ */
+#define DM_USERSPACE_MAP_BLOCK 1
+#define DM_USERSPACE_MAP_FAILED 2
+#define DM_USERSPACE_MAP_INVALIDATE 3
+#define DM_USERSPACE_COPY_FINISHED 100
+#define DM_USERSPACE_INVAL_COMPLETE 101
+#define DM_USERSPACE_INVAL_FAILED 102
+
+/*
+ * Flags and associated macros
+ */
+#define DMU_FLAG_VALID 1
+#define DMU_FLAG_RD 2
+#define DMU_FLAG_WR 4
+#define DMU_FLAG_COPY_FIRST 8
+#define DMU_FLAG_TEMPORARY 16
+
+static inline int dmu_get_flag(uint32_t *flags, uint32_t flag)
+{
+        return (*flags & flag) != 0;
+}
+
+static inline void dmu_set_flag(uint32_t *flags, uint32_t flag)
+{
+        *flags |= flag;
+}
+
+static inline void dmu_clr_flag(uint32_t *flags, uint32_t flag)
+{
+        *flags &= (~flag);
+}
+
+static inline void dmu_cpy_flag(uint32_t *flags, uint32_t src, uint32_t flag)
+{
+        *flags = (*flags & ~flag) | (src & flag);
+}
+
+/*
+ * This is the message that is passed back and forth between the
+ * kernel and the user application
+ */
+struct dmu_write {
+        uint64_t org_block;     /* Block that was accessed */
+        uint64_t new_block;     /* The new block it should go to */
+        int64_t offset;         /* Sector offset of the block, if needed */
+
+        uint32_t id;            /* Unique ID for this request */
+        uint32_t type;          /* Type of request */
+        uint32_t flags;         /* Flags */
+
+        uint32_t src_maj;       /* The source device for copying */
+        uint32_t src_min;
+
+        uint32_t dest_maj;      /* Destination device for copying, and */
+        uint32_t dest_min;      /* for the block access */
+};
+
+#endif
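
Usage note (not part of the patch): a target is created with a table line of the form "<start> <length> userspace <key> <block-size-in-bytes>", e.g. something along the lines of `echo "0 $SECTORS userspace mykey 4096" | dmsetup create mydev`, and the per-device control node's major:minor pair is what STATUSTYPE_INFO reports via `dmsetup status`. The sketch below shows the message loop a control daemon would run against that node using struct dmu_write from the new header. It is illustrative only: the node path /dev/dmu-ctl and the destination major/minor are assumptions, it handles one message per syscall instead of batches, and it ignores COPY_FIRST and invalidation traffic.

/*
 * Illustrative userspace mapper sketch -- not part of the patch.
 * Assumes /dev/dmu-ctl is a node created by the admin for the control
 * device reported by "dmsetup status", and that every block maps 1:1
 * onto one destination device DEST_MAJ:DEST_MIN.
 */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <linux/dm-userspace.h>

#define DEST_MAJ 8      /* hypothetical destination device */
#define DEST_MIN 0

int main(void)
{
        struct dmu_write msg;
        int fd = open("/dev/dmu-ctl", O_RDWR);

        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* One request at a time; the kernel read/write paths also
         * accept arrays of struct dmu_write for batching */
        while (read(fd, &msg, sizeof(msg)) == sizeof(msg)) {
                if (msg.type != DM_USERSPACE_MAP_BLOCK)
                        continue;       /* COPY_FINISHED etc. need no reply */

                /* Keep msg.id and the RD/WR flags so the kernel can
                 * match this response to its pending request */
                msg.new_block = msg.org_block;  /* identity mapping */
                msg.offset = 0;
                msg.dest_maj = DEST_MAJ;
                msg.dest_min = DEST_MIN;

                if (write(fd, &msg, sizeof(msg)) != sizeof(msg))
                        break;
        }

        close(fd);
        return 0;
}

A production daemon would presumably poll() the control device (wired up through dmu_ctl_poll), read requests in batches, and answer with real mappings, DM_USERSPACE_MAP_FAILED on errors, or DM_USERSPACE_MAP_INVALIDATE plus the COPY_FIRST/src_maj/src_min fields when data must be copied before the remap goes live.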